1
0
Fork 0

Implement working support for scraping HTML

This implements basic working support for fetching HTML documents
via HTTP/HTTPS and scraping the resulting markup via CSS selectors.

Support for generic sources and parsers has also been implemented,
allowing future support for fetching from other sources (e.g. local
files) and for scraping arbitrary file formats such as JSON or YAML.

Basic tests have been implemented. Documentation is severely lacking.
This commit is contained in:
Alex Palaistras 2016-05-26 01:42:24 +01:00
parent 6ff1c8cf64
commit a3570358ed
6 changed files with 356 additions and 0 deletions

90
farsight.go Normal file
View File

@ -0,0 +1,90 @@
package farsight
import (
// Standard library.
"fmt"
"reflect"
// Internal packages.
"github.com/deuill/farsight/parser"
"github.com/deuill/farsight/source"
// Pre-defined sources and parsers.
_ "github.com/deuill/farsight/parser/html"
_ "github.com/deuill/farsight/source/http"
)
// Fetch retrieves the resource identified by the URI in `src`, parses it with
// the parser registered under `kind`, and stores the extracted values in the
// struct pointed to by `dest`.
//
// `dest` must be a non-nil pointer to a struct; any other value is rejected
// with an error before any data is fetched. Errors reported by the source, the
// parser or while populating fields are returned unchanged.
func Fetch(src string, dest interface{}, kind string) error {
	// Verify destination value type. The value has to be a pointer AND point to
	// a struct, so failing either check is an error. Testing the pointer kind
	// first also guards the call to `Elem`, which panics for kinds other than
	// pointer and interface. A nil pointer yields an invalid element and is
	// rejected by the second check.
	val := reflect.ValueOf(dest)
	if val.Kind() != reflect.Ptr || val.Elem().Kind() != reflect.Struct {
		return fmt.Errorf("Invalid destination type '%T', expected non-nil pointer to struct", dest)
	}

	// Fetch data from source defined in `src`.
	buf, err := source.Fetch(src)
	if err != nil {
		return err
	}

	// Parse raw data and return parsed document.
	doc, err := parser.Parse(kind, buf)
	if err != nil {
		return err
	}

	// Populate destination fields from parsed document.
	return populateStruct(doc, val.Elem())
}
// populateStruct visits the fields of the struct value `dest` in declaration
// order. For every field carrying a non-empty `farsight` tag other than "-",
// the tag value is used to filter `doc` and the resulting sub-document is
// written to the field via setField. The first error encountered aborts the
// walk and is returned.
func populateStruct(doc parser.Document, dest reflect.Value) error {
	for idx := 0; idx < dest.NumField(); idx++ {
		target := dest.Field(idx)
		selector := dest.Type().Field(idx).Tag.Get("farsight")

		// Untagged fields and fields explicitly tagged "-" are left untouched.
		switch selector {
		case "", "-":
			continue
		}

		// Narrow the document down to the part addressed by the tag.
		filtered, err := doc.Filter(selector)
		if err != nil {
			return err
		}

		if err := setField(filtered, target); err != nil {
			return err
		}
	}

	return nil
}
// setField stores the content of `doc` in the struct field `field`.
//
// String fields receive the document's string representation. Struct fields
// are populated recursively, using `doc` as the document their own `farsight`
// tags are evaluated against. Any other field kind results in an error.
func setField(doc parser.Document, field reflect.Value) error {
	switch field.Kind() {
	case reflect.String:
		// reflect panics when setting unexported or unaddressable fields;
		// report the problem to the caller instead of crashing.
		if !field.CanSet() {
			return fmt.Errorf("Unable to set unexported or unaddressable string field")
		}

		// The string value is only needed (and only computed) for this kind.
		field.SetString(doc.String())
	case reflect.Struct:
		return populateStruct(doc, field)
	default:
		return fmt.Errorf("Unable to set unknown field type '%s'", field.Kind().String())
	}

	return nil
}

62
farsight_test.go Normal file
View File

@ -0,0 +1,62 @@
package farsight
import (
// Standard library.
"fmt"
"io"
"reflect"
"strings"
"testing"
// Internal packages.
"github.com/deuill/farsight/source"
)
// TestSource is a mock source serving canned content from an in-memory map,
// keyed by the full source URI.
type TestSource struct {
	data map[string]TestCase
}

// Fetch returns a reader over the content stored for `src`, or an error if no
// test case is stored under that key.
func (t *TestSource) Fetch(src string) (io.Reader, error) {
	tc, found := t.data[src]
	if !found {
		return nil, fmt.Errorf("Unknown source data requested")
	}

	return strings.NewReader(tc.Content), nil
}

// TestCase describes a single fetch scenario.
type TestCase struct {
	Content  string      // Raw document served by the mock source.
	Actual   interface{} // Destination value handed to Fetch.
	Expected interface{} // Value `Actual` is compared against after Fetch.
}
// fetchTests holds the scenarios exercised by TestFetch, keyed by the source
// URI each scenario is fetched from.
var fetchTests = map[string]TestCase{
	// Fetch and set ID attribute.
	"html://id-test": {
		Content: `<html><div id="hello">Hello World</div></html>`,
		Actual: &struct {
			Hello string `farsight:"#hello"`
		}{},
		Expected: &struct {
			Hello string `farsight:"#hello"`
		}{
			Hello: "Hello World",
		},
	},
}
// TestFetch runs Fetch against every entry in fetchTests, serving the content
// through a mock source registered for the "html" scheme.
func TestFetch(t *testing.T) {
	// Register mock source. Register only reports an error when the name is
	// already taken; the result is explicitly discarded.
	_ = source.Register("html", &TestSource{data: fetchTests})

	// Execute tests sequentially.
	for uri, tc := range fetchTests {
		err := Fetch(uri, tc.Actual, "html")
		if err != nil {
			t.Errorf("Fetch failed for '%s': %s", uri, err)
		}

		if !reflect.DeepEqual(tc.Actual, tc.Expected) {
			t.Errorf("Testing '%s' failed: expected '%v', actual '%v'\n", uri, tc.Expected, tc.Actual)
		}
	}
}

76
parser/html/html.go Normal file
View File

@ -0,0 +1,76 @@
package html
import (
// Standard library.
"bytes"
"io"
// Internal packages.
"github.com/deuill/farsight/parser"
// Third-party packages.
"github.com/andybalholm/cascadia"
"golang.org/x/net/html"
)
// HTMLParser parses HTML markup into a queryable document.
type HTMLParser struct{}

// Parse reads HTML from `r` and returns a document whose only node is the
// root of the parsed tree, or the error reported by the underlying HTML
// parser.
func (h *HTMLParser) Parse(r io.Reader) (parser.Document, error) {
	root, err := html.Parse(r)
	if err != nil {
		return nil, err
	}

	nodes := []*html.Node{root}
	return &HTMLDocument{nodes: nodes}, nil
}
// HTMLDocument is a set of HTML nodes that can be filtered further or
// rendered as text.
type HTMLDocument struct {
	nodes []*html.Node
}

// Filter compiles `attr` as a CSS selector and returns a new document made of
// all nodes matched by that selector within each node of this document. An
// error is returned if the selector fails to compile.
func (h *HTMLDocument) Filter(attr string) (parser.Document, error) {
	selector, err := cascadia.Compile(attr)
	if err != nil {
		return nil, err
	}

	matched := make([]*html.Node, 0)
	for i := range h.nodes {
		matched = append(matched, selector.MatchAll(h.nodes[i])...)
	}

	return &HTMLDocument{nodes: matched}, nil
}

// String returns the text content of every node in the document, concatenated
// in order.
func (h *HTMLDocument) String() string {
	text := new(bytes.Buffer)
	for i := range h.nodes {
		text.WriteString(getNodeText(h.nodes[i]))
	}

	return text.String()
}
// getNodeText returns the text held by `n`: the node's own data for a text
// node, otherwise the concatenated text of all its children, visited
// recursively from first child to last. Nodes without children that are not
// text nodes yield an empty string.
func getNodeText(n *html.Node) string {
	if n.Type == html.TextNode {
		return n.Data
	}

	// A childless node never enters the loop and yields the empty buffer.
	var text bytes.Buffer
	for child := n.FirstChild; child != nil; child = child.NextSibling {
		text.WriteString(getNodeText(child))
	}

	return text.String()
}
// init makes the HTML parser available under the name "html". The error
// returned by Register is not inspected.
func init() {
	parser.Register("html", new(HTMLParser))
}

44
parser/parser.go Normal file
View File

@ -0,0 +1,44 @@
package parser
import (
// Standard library.
"fmt"
"io"
)
// Parser turns raw data into a queryable Document. Implementations are made
// available to callers of Parse by registering them via Register.
type Parser interface {
// Parse reads raw data from the reader and returns the parsed document, or
// an error if parsing fails.
Parse(io.Reader) (Document, error)
}
// Document is a parsed document that can be narrowed down and rendered as a
// string.
type Document interface {
// Filter returns the part of the document selected by `attr`. The syntax of
// `attr` is defined by the concrete implementation.
Filter(attr string) (Document, error)
// String returns the document's content as a string.
String() string
}
// A map of all registered parsers, keyed by the name they were registered
// under. Allocated in this package's init function.
var parsers map[string]Parser
// Register makes the parser `rcvr` available under `name`. Names are unique:
// registering a name that is already taken leaves the existing parser in
// place and returns an error.
func Register(name string, rcvr Parser) error {
	_, taken := parsers[name]
	if taken {
		return fmt.Errorf("Parser '%s' already registered, refusing to overwrite", name)
	}

	parsers[name] = rcvr

	return nil
}
// Parse reads the document from `src` using the parser registered under
// `kind`. It returns the parsed document, which can then be queried against,
// or an error if no such parser is registered or parsing fails.
func Parse(kind string, src io.Reader) (Document, error) {
	p, found := parsers[kind]
	if !found {
		return nil, fmt.Errorf("Parser for '%s' not found", kind)
	}

	return p.Parse(src)
}
func init() {
parsers = make(map[string]Parser)
}

41
source/http/http.go Normal file
View File

@ -0,0 +1,41 @@
package http
import (
// Standard library.
"bytes"
"io"
"net/http"
// Internal packages.
"github.com/deuill/farsight/source"
)
// HTTP is a source that retrieves documents with HTTP GET requests.
type HTTP struct{}

// StatusError is returned by Fetch when the server answers with a status code
// outside the 2xx range.
type StatusError struct {
	URL    string // URL that was requested.
	Code   int    // Numeric status code, e.g. 404.
	Status string // Status text of the response, e.g. "404 Not Found".
}

// Error implements the error interface.
func (e *StatusError) Error() string {
	return "Unexpected HTTP status '" + e.Status + "' for '" + e.URL + "'"
}

// Fetch issues a GET request for `src` and returns the complete response body
// as an in-memory reader.
//
// An error is returned if the request fails, if the body cannot be read, or
// (as a *StatusError) if the response status is not in the 2xx range, so that
// error pages are never handed to a parser as if they were the document.
//
// NOTE: the request goes through http.Get and therefore the default client,
// which does not apply a timeout.
func (h *HTTP) Fetch(src string) (io.Reader, error) {
	// Attempt to fetch resource from source endpoint.
	resp, err := http.Get(src)
	if err != nil {
		return nil, err
	}

	defer resp.Body.Close()

	// Anything but a successful response is reported as an error.
	if resp.StatusCode < 200 || resp.StatusCode > 299 {
		return nil, &StatusError{URL: src, Code: resp.StatusCode, Status: resp.Status}
	}

	// Fetch and copy body content locally. This incurs some extra overhead, but
	// avoids having to pass responsibility for closing the Reader to the caller.
	var buffer bytes.Buffer
	if _, err = buffer.ReadFrom(resp.Body); err != nil {
		return nil, err
	}

	return &buffer, nil
}
// init registers a single HTTP source instance for both the "http" and the
// "https" scheme, in that order. The errors returned by Register are not
// inspected.
func init() {
	handler := new(HTTP)

	for _, scheme := range []string{"http", "https"} {
		source.Register(scheme, handler)
	}
}

43
source/source.go Normal file
View File

@ -0,0 +1,43 @@
package source
import (
// Standard library.
"fmt"
"io"
"strings"
)
// Source retrieves raw data for a source URI. Implementations are made
// available to callers of Fetch by registering them via Register.
type Source interface {
// Fetch returns a reader over the data found at `src`, or an error if the
// data cannot be retrieved.
Fetch(src string) (io.Reader, error)
}
// A map of all registered sources, keyed by the name they were registered
// under. Fetch looks sources up by the part of the URI preceding the first
// colon. Allocated in this package's init function.
var sources map[string]Source
// Register makes the source `rcvr` available under `name`. Names are unique:
// registering a name that is already taken leaves the existing source in
// place and returns an error.
func Register(name string, rcvr Source) error {
	_, taken := sources[name]
	if taken {
		return fmt.Errorf("Source '%s' already registered, refusing to overwrite", name)
	}

	sources[name] = rcvr

	return nil
}
// Fetch retrieves the resource at `src` through the source registered for its
// scheme, i.e. the part of `src` preceding the first colon. The complete,
// unmodified `src` is handed to that source.
//
// An error is returned if `src` contains no colon, if no source is registered
// for the scheme, or if the source itself fails.
func Fetch(src string) (io.Reader, error) {
	parts := strings.Split(src, ":")
	if len(parts) <= 1 {
		return nil, fmt.Errorf("Failed to parse source URL '%s'", src)
	}

	scheme := parts[0]
	handler, found := sources[scheme]
	if !found {
		return nil, fmt.Errorf("Source scheme '%s' does not match a registered source", scheme)
	}

	return handler.Fetch(src)
}
func init() {
sources = make(map[string]Source)
}