1
0
Fork 0

Implement more destination types, attr selector handling

This commit is contained in:
Alex Palaistras 2016-05-27 02:18:27 +01:00
parent 63a18b34b1
commit 130da67f3f
2 changed files with 73 additions and 6 deletions

View File

@ -8,6 +8,8 @@ import (
// Standard library.
"fmt"
"reflect"
"regexp"
"strconv"
// Internal packages.
"github.com/deuill/farsight/parser"
@ -18,6 +20,11 @@ import (
_ "github.com/deuill/farsight/source/http"
)
// Regular expression patterns used by setField to pull a numeric literal out
// of free-form text before the result is handed to strconv. They are stored
// as pattern strings, not as compiled expressions.
//
// NOTE(review): setField passes these to regexp.MustCompile each time it
// converts a non-empty value for an integer or float field; compiling once at
// package scope would avoid the repeated work. Confirm that no other code
// relies on these being plain strings before changing their type.
var (
// regexpValidInt matches an optionally signed run of decimal digits.
regexpValidInt = `[-+]?[0-9]+`
// regexpValidFloat matches an optionally signed decimal number with an
// optional fractional part and an optional exponent (e.g. "-1.5e3", ".5").
regexpValidFloat = `[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?`
)
// Fetch data from the source pointed to by the URI in `src`, and store it in the
// arbitrary struct pointed to by `dest`. Data is parsed according to `kind`,
// which has to correspond to a registered parser.
@ -86,6 +93,36 @@ func setField(doc parser.Document, field reflect.Value) error {
switch field.Kind() {
case reflect.String:
field.SetString(val)
case reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64:
if val == "" {
field.SetInt(0)
} else {
// Truncate string to valid integer.
val = regexp.MustCompile(regexpValidInt).FindString(val)
// Parse string and return integer value.
num, err := strconv.ParseInt(val, 10, 64)
if err != nil {
return err
}
field.SetInt(num)
}
case reflect.Float32, reflect.Float64:
if val == "" {
field.SetFloat(0)
} else {
// Truncate string to valid floating point number.
val = regexp.MustCompile(regexpValidFloat).FindString(val)
// Parse string and return floating point value.
num, err := strconv.ParseFloat(val, 64)
if err != nil {
return err
}
field.SetFloat(num)
}
case reflect.Slice:
// Decompose document into list and prepare destination slice.
list := doc.List()
@ -93,7 +130,7 @@ func setField(doc parser.Document, field reflect.Value) error {
for i, d := range list {
if err := setField(d, slice.Index(i)); err != nil {
return nil
return err
}
}

View File

@ -9,6 +9,7 @@ import (
"bytes"
"fmt"
"io"
"strings"
// Internal packages.
"github.com/deuill/farsight/parser"
@ -41,19 +42,48 @@ type HTMLDocument struct {
// the provided CSS selector. On success, a new document is returned, containing
// a list of all matched elements. An error is returned if the CSS selector is
// malformed, or no elements were matched.
func (h *HTMLDocument) Filter(attr string) (parser.Document, error) {
sel, err := cascadia.Compile(attr)
func (h *HTMLDocument) Filter(sel string) (parser.Document, error) {
var attr string
// Parse optional attribute selector.
idx := strings.LastIndex(sel, "/")
if idx > 0 {
attr = sel[(idx + 1):]
sel = sel[:idx]
}
s, err := cascadia.Compile(sel)
if err != nil {
return nil, err
}
sub := &HTMLDocument{nodes: []*html.Node{}}
for _, n := range h.nodes {
sub.nodes = append(sub.nodes, sel.MatchAll(n)...)
sub.nodes = append(sub.nodes, s.MatchAll(n)...)
}
if len(sub.nodes) == 0 {
return nil, fmt.Errorf("Attribute '%s' matched no elements", attr)
return nil, fmt.Errorf("Selector '%s' matched no elements", sel)
}
// Loop through node attributes and attempt to match the requested attribute.
// If a matching attribute key is found, replace the current node with a
// TextNode containing only the attribute value.
if attr != "" {
for i, n := range sub.nodes {
var found bool
for _, a := range n.Attr {
if a.Key == attr {
sub.nodes[i] = &html.Node{Type: html.TextNode, Data: a.Val}
found = true
}
}
if !found {
return nil, fmt.Errorf("Unable to find attribute '%s' for selector '%s'", attr, sel)
}
}
}
return sub, nil
@ -96,7 +126,7 @@ func getNodeText(n *html.Node) string {
buf.WriteString(getNodeText(c))
}
return buf.String()
return strings.TrimSpace(buf.String())
}
return ""