mirror of https://github.com/deuill/farsight.git
Improve documentation, add correct handling for slices
This commit is contained in:
parent
92fb4d6e25
commit
afe72b7d47
18
farsight.go
18
farsight.go
|
@ -19,7 +19,8 @@ import (
|
|||
)
|
||||
|
||||
// Fetch data from source pointed to by URI in `src`, and store to arbitrary
|
||||
// struct pointed to by `dest`.
|
||||
// struct pointed to by `dest`. Data is parsed according to `kind`, and has to
|
||||
// correspond to a registered parser.
|
||||
func Fetch(src string, dest interface{}, kind string) error {
|
||||
// Verify destination value type.
|
||||
val := reflect.ValueOf(dest)
|
||||
|
@ -48,7 +49,8 @@ func Fetch(src string, dest interface{}, kind string) error {
|
|||
return nil
|
||||
}
|
||||
|
||||
// Set struct fields sequentially according to their `farsight` tags.
|
||||
// Set struct fields from document, filtered by tags marked by "farsight"
|
||||
// definitions.
|
||||
func populateStruct(doc parser.Document, dest reflect.Value) error {
|
||||
// Set each struct field in sequence.
|
||||
for i := 0; i < dest.NumField(); i++ {
|
||||
|
@ -84,6 +86,18 @@ func setField(doc parser.Document, field reflect.Value) error {
|
|||
switch field.Kind() {
|
||||
case reflect.String:
|
||||
field.SetString(val)
|
||||
case reflect.Slice:
	// Decompose the document into a list of sub-documents and allocate a
	// destination slice with exactly one element per list entry. Length and
	// capacity are both len(list) so that no spare capacity left over from
	// building the list leaks into the destination.
	list := doc.List()
	slice := reflect.MakeSlice(field.Type(), len(list), len(list))

	for i, d := range list {
		// Propagate the failure: returning nil here would skip field.Set
		// below and still report success to the caller.
		if err := setField(d, slice.Index(i)); err != nil {
			return err
		}
	}

	field.Set(slice)
|
||||
case reflect.Struct:
|
||||
return populateStruct(doc, field)
|
||||
default:
|
||||
|
|
|
@ -34,19 +34,30 @@ type TestCase struct {
|
|||
Expected interface{}
|
||||
}
|
||||
|
||||
// Test cases for `farsight.Fetch` function.
|
||||
var fetchTests = map[string]TestCase{
|
||||
// Fetch and set ID attribute.
|
||||
"html://id-test": {
|
||||
"html://string": {
|
||||
`<html><div id="hello">Hello World</div></html>`,
|
||||
&struct {
|
||||
Hello string `farsight:"#hello"`
|
||||
Text string `farsight:"#hello"`
|
||||
}{},
|
||||
&struct {
|
||||
Hello string `farsight:"#hello"`
|
||||
Text string `farsight:"#hello"`
|
||||
}{
|
||||
"Hello World",
|
||||
},
|
||||
},
|
||||
"html://slice": {
|
||||
`<body><ul id="g"><li>Hello</li><li>World</li></ul></body>`,
|
||||
&struct {
|
||||
List []string `farsight:"#g li"`
|
||||
}{},
|
||||
&struct {
|
||||
List []string `farsight:"#g li"`
|
||||
}{
|
||||
[]string{"Hello", "World"},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
func TestFetch(t *testing.T) {
|
||||
|
|
|
@ -7,6 +7,7 @@ package html
|
|||
import (
|
||||
// Standard library.
|
||||
"bytes"
|
||||
"fmt"
|
||||
"io"
|
||||
|
||||
// Internal packages.
|
||||
|
@ -17,21 +18,29 @@ import (
|
|||
"golang.org/x/net/html"
|
||||
)
|
||||
|
||||
// HTMLParser represents a parser and tokeniser for HTML documents.
|
||||
type HTMLParser struct{}
|
||||
|
||||
// Parse reads an HTML document from the reader passed, and returns a document
|
||||
// containing a single parent node. An error is returned if parsing fails.
|
||||
func (h *HTMLParser) Parse(r io.Reader) (parser.Document, error) {
|
||||
doc, err := html.Parse(r)
|
||||
n, err := html.Parse(r)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return &HTMLDocument{nodes: []*html.Node{doc}}, nil
|
||||
return &HTMLDocument{nodes: []*html.Node{n}}, nil
|
||||
}
|
||||
|
||||
// HTMLDocument represents a collection of nodes under a single parent container.
|
||||
type HTMLDocument struct {
|
||||
nodes []*html.Node
|
||||
}
|
||||
|
||||
// Filter traverses the document tree and attempts to match elements against
|
||||
// the provided CSS selector. On success, a new document is returned, containing
|
||||
// a list of all matched elements. An error is returned if the CSS selector is
|
||||
// malformed, or no elements were matched.
|
||||
func (h *HTMLDocument) Filter(attr string) (parser.Document, error) {
|
||||
sel, err := cascadia.Compile(attr)
|
||||
if err != nil {
|
||||
|
@ -43,9 +52,27 @@ func (h *HTMLDocument) Filter(attr string) (parser.Document, error) {
|
|||
sub.nodes = append(sub.nodes, sel.MatchAll(n)...)
|
||||
}
|
||||
|
||||
if len(sub.nodes) == 0 {
|
||||
return nil, fmt.Errorf("Attribute '%s' matched no elements", attr)
|
||||
}
|
||||
|
||||
return sub, nil
|
||||
}
|
||||
|
||||
// List decomposes the target HTMLDocument into a slice of HTMLDocument types,
|
||||
// each containing a single node from the parent's list of nodes.
|
||||
func (h *HTMLDocument) List() []parser.Document {
|
||||
var docs []parser.Document
|
||||
|
||||
for _, n := range h.nodes {
|
||||
docs = append(docs, &HTMLDocument{nodes: []*html.Node{n}})
|
||||
}
|
||||
|
||||
return docs
|
||||
}
|
||||
|
||||
// Returns the document contents by traversing the tree and concatenating all
|
||||
// data contained within text nodes.
|
||||
func (h *HTMLDocument) String() string {
|
||||
var buf bytes.Buffer
|
||||
|
||||
|
@ -56,6 +83,7 @@ func (h *HTMLDocument) String() string {
|
|||
return buf.String()
|
||||
}
|
||||
|
||||
// Traverse document tree and return the first text node's contents as a string.
|
||||
func getNodeText(n *html.Node) string {
|
||||
if n.Type == html.TextNode {
|
||||
return n.Data
|
||||
|
|
|
@ -16,6 +16,7 @@ type Parser interface {
|
|||
|
||||
type Document interface {
|
||||
Filter(attr string) (Document, error)
|
||||
List() []Document
|
||||
String() string
|
||||
}
|
||||
|
||||
|
|
|
@ -14,9 +14,12 @@ import (
|
|||
"github.com/deuill/farsight/source"
|
||||
)
|
||||
|
||||
type HTTP struct{}
|
||||
// HTTPSource represents a source for HTTP and HTTPS endpoints.
|
||||
type HTTPSource struct{}
|
||||
|
||||
func (h *HTTP) Fetch(src string) (io.Reader, error) {
|
||||
// Fetch issues a GET request against the source URL pointed to by `src`, and
|
||||
// returns an io.Reader for the containing HTML document.
|
||||
func (h *HTTPSource) Fetch(src string) (io.Reader, error) {
|
||||
// Attempt to fetch resource from source endpoint.
|
||||
resp, err := http.Get(src)
|
||||
if err != nil {
|
||||
|
@ -37,7 +40,7 @@ func (h *HTTP) Fetch(src string) (io.Reader, error) {
|
|||
}
|
||||
|
||||
func init() {
|
||||
h := &HTTP{}
|
||||
h := &HTTPSource{}
|
||||
|
||||
// Register HTTP source for both "http" and "https" endpoints.
|
||||
source.Register("http", h)
|
||||
|
|
Loading…
Reference in New Issue