mirror of https://github.com/deuill/farsight.git
Implement more destination types, attr selector handling
This commit is contained in:
parent
63a18b34b1
commit
130da67f3f
39
farsight.go
39
farsight.go
|
@ -8,6 +8,8 @@ import (
|
|||
// Standard library.
|
||||
"fmt"
|
||||
"reflect"
|
||||
"regexp"
|
||||
"strconv"
|
||||
|
||||
// Internal packages.
|
||||
"github.com/deuill/farsight/parser"
|
||||
|
@ -18,6 +20,11 @@ import (
|
|||
_ "github.com/deuill/farsight/source/http"
|
||||
)
|
||||
|
||||
// Regular expression sources used when converting extracted text to numeric
// field values. They are kept as plain pattern strings (not compiled
// regexps); the code that consumes them compiles each pattern at the point of
// use and applies FindString to pull the numeric portion out of the text
// before handing it to strconv.
var (
|
||||
// An optional sign followed by one or more decimal digits.
regexpValidInt = `[-+]?[0-9]+`
|
||||
// An optional sign, optional integer digits, an optional decimal point, one
// or more digits, and an optional exponent part.
regexpValidFloat = `[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?`
|
||||
)
|
||||
|
||||
// Fetch data from source pointed to by URI in `src`, and store to arbitrary
|
||||
// struct pointed to by `dest`. Data is parsed according to `kind`, and has to
|
||||
// correspond to a registered parser.
|
||||
|
@ -86,6 +93,36 @@ func setField(doc parser.Document, field reflect.Value) error {
|
|||
switch field.Kind() {
|
||||
case reflect.String:
|
||||
field.SetString(val)
|
||||
case reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64:
|
||||
if val == "" {
|
||||
field.SetInt(0)
|
||||
} else {
|
||||
// Truncate string to valid integer.
|
||||
val = regexp.MustCompile(regexpValidInt).FindString(val)
|
||||
|
||||
// Parse string and return integer value.
|
||||
num, err := strconv.ParseInt(val, 10, 64)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
field.SetInt(num)
|
||||
}
|
||||
case reflect.Float32, reflect.Float64:
|
||||
if val == "" {
|
||||
field.SetFloat(0)
|
||||
} else {
|
||||
// Truncate string to valid floating point number.
|
||||
val = regexp.MustCompile(regexpValidFloat).FindString(val)
|
||||
|
||||
// Parse string and return floating point value.
|
||||
num, err := strconv.ParseFloat(val, 64)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
field.SetFloat(num)
|
||||
}
|
||||
case reflect.Slice:
|
||||
// Decompose document into list and prepare destination slice.
|
||||
list := doc.List()
|
||||
|
@ -93,7 +130,7 @@ func setField(doc parser.Document, field reflect.Value) error {
|
|||
|
||||
for i, d := range list {
|
||||
if err := setField(d, slice.Index(i)); err != nil {
|
||||
return nil
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -9,6 +9,7 @@ import (
|
|||
"bytes"
|
||||
"fmt"
|
||||
"io"
|
||||
"strings"
|
||||
|
||||
// Internal packages.
|
||||
"github.com/deuill/farsight/parser"
|
||||
|
@ -41,19 +42,48 @@ type HTMLDocument struct {
|
|||
// the provided CSS selector. On success, a new document is returned, containing
|
||||
// a list of all matched elements. An error is returned if the CSS selector is
|
||||
// malformed, or no elements were matched.
|
||||
func (h *HTMLDocument) Filter(attr string) (parser.Document, error) {
|
||||
sel, err := cascadia.Compile(attr)
|
||||
func (h *HTMLDocument) Filter(sel string) (parser.Document, error) {
|
||||
var attr string
|
||||
|
||||
// Parse optional attribute selector.
|
||||
idx := strings.LastIndex(sel, "/")
|
||||
if idx > 0 {
|
||||
attr = sel[(idx + 1):]
|
||||
sel = sel[:idx]
|
||||
}
|
||||
|
||||
s, err := cascadia.Compile(sel)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
sub := &HTMLDocument{nodes: []*html.Node{}}
|
||||
for _, n := range h.nodes {
|
||||
sub.nodes = append(sub.nodes, sel.MatchAll(n)...)
|
||||
sub.nodes = append(sub.nodes, s.MatchAll(n)...)
|
||||
}
|
||||
|
||||
if len(sub.nodes) == 0 {
|
||||
return nil, fmt.Errorf("Attribute '%s' matched no elements", attr)
|
||||
return nil, fmt.Errorf("Selector '%s' matched no elements", sel)
|
||||
}
|
||||
|
||||
// Loop through node attributes and attempt to match requested attribute.
|
||||
// If a matching attribute key is found, replace current node with a
|
||||
// TextNode containing only the attribute value.
|
||||
if attr != "" {
|
||||
for i, n := range sub.nodes {
|
||||
var found bool
|
||||
|
||||
for _, a := range n.Attr {
|
||||
if a.Key == attr {
|
||||
sub.nodes[i] = &html.Node{Type: html.TextNode, Data: a.Val}
|
||||
found = true
|
||||
}
|
||||
}
|
||||
|
||||
if !found {
|
||||
return nil, fmt.Errorf("Unable to find attribute '%s' for selector '%s'", attr, sel)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return sub, nil
|
||||
|
@ -96,7 +126,7 @@ func getNodeText(n *html.Node) string {
|
|||
buf.WriteString(getNodeText(c))
|
||||
}
|
||||
|
||||
return buf.String()
|
||||
return strings.TrimSpace(buf.String())
|
||||
}
|
||||
|
||||
return ""
|
||||
|
|
Loading…
Reference in New Issue