1
0
Fork 0
farsight/parser/html/html.go

139 lines
3.3 KiB
Go

// Copyright 2016 Alexander Palaistras. All rights reserved.
// Use of this source code is governed by the MIT license that can be found in
// the LICENSE file.
package html
import (
// Standard library.
"bytes"
"fmt"
"io"
"strings"
// Internal packages.
"github.com/deuill/farsight/parser"
// Third-party packages.
"github.com/andybalholm/cascadia"
"golang.org/x/net/html"
)
// HTMLParser represents a parser and tokeniser for HTML documents. The type
// carries no state; an instance is registered with the parser package under
// the name "html" by this package's init function.
type HTMLParser struct{}
// Parse consumes the HTML document supplied by r and wraps the root node
// produced by html.Parse in an HTMLDocument holding that single node. Any
// error reported by html.Parse is returned as-is, together with a nil
// document.
func (h *HTMLParser) Parse(r io.Reader) (parser.Document, error) {
	root, err := html.Parse(r)
	if err != nil {
		return nil, err
	}

	doc := &HTMLDocument{nodes: []*html.Node{root}}
	return doc, nil
}
// HTMLDocument represents a collection of nodes under a single parent container.
type HTMLDocument struct {
	// nodes lists the HTML nodes that make up the document: the parsed root
	// node for a document returned by Parse, the matched (or attribute-derived
	// text) nodes for one returned by Filter, and a single node for each
	// document returned by Slice.
nodes []*html.Node
}
// Filter traverses the document tree and attempts to match elements against
// the provided CSS selector. On success, a new document is returned, containing
// a list of all matched elements.
//
// The selector may carry an attribute suffix of the form "selector/attribute"
// (for example "a.link/href"). In that case every matched element is replaced
// in the result by a text node containing the value of the named attribute.
//
// A slash that belongs to the selector itself, as in `a[href="/about"]`, is not
// treated as an attribute separator: if the text before the last slash does not
// compile as a selector but the whole of sel does, sel is used as given and no
// attribute is extracted.
//
// An error is returned if the CSS selector is malformed, if no elements were
// matched, or if any matched element lacks the requested attribute.
func (h *HTMLDocument) Filter(sel string) (parser.Document, error) {
	// Split off the optional "/attribute" suffix. A slash in first position
	// (idx == 0) would leave an empty selector, so it is never split on.
	expr, attr := sel, ""
	if idx := strings.LastIndex(sel, "/"); idx > 0 {
		expr, attr = sel[:idx], sel[idx+1:]
	}

	s, err := cascadia.Compile(expr)
	if err != nil {
		if expr == sel {
			// Nothing was split off, so there is no alternative reading.
			return nil, err
		}

		// The slash may be part of the selector rather than an attribute
		// separator (typically inside a quoted attribute value). Retry with
		// the selector exactly as it was passed in.
		whole, wholeErr := cascadia.Compile(sel)
		if wholeErr != nil {
			// Neither reading compiles; report the error for the split form,
			// which is what callers have always received.
			return nil, err
		}

		s, expr, attr = whole, sel, ""
	}

	var matched []*html.Node
	for _, n := range h.nodes {
		matched = append(matched, s.MatchAll(n)...)
	}

	if len(matched) == 0 {
		return nil, fmt.Errorf("Selector '%s' matched no elements", expr)
	}

	// Loop through node attributes and attempt to match requested attribute.
	// If a matching attribute key is matched, replace current node with a
	// TextNode containing only the attribute value.
	if attr != "" {
		for i, n := range matched {
			var found bool

			// The scan deliberately runs over every attribute: should the key
			// appear more than once, the value of the last occurrence is kept.
			for _, a := range n.Attr {
				if a.Key == attr {
					matched[i] = &html.Node{Type: html.TextNode, Data: a.Val}
					found = true
				}
			}

			if !found {
				return nil, fmt.Errorf("Unable to find attribute '%s' for selector '%s'", attr, expr)
			}
		}
	}

	return &HTMLDocument{nodes: matched}, nil
}
// Slice splits the document into one HTMLDocument per node, preserving the
// order of the receiver's node list. Each returned document holds exactly one
// node. The result is nil when the receiver holds no nodes.
func (h *HTMLDocument) Slice() []parser.Document {
	var out []parser.Document

	for i := range h.nodes {
		single := &HTMLDocument{nodes: []*html.Node{h.nodes[i]}}
		out = append(out, single)
	}

	return out
}
// String returns the document contents as text: the result of getNodeText for
// each node in the document, concatenated in order with no separator.
func (h *HTMLDocument) String() string {
	out := new(bytes.Buffer)

	for i := 0; i < len(h.nodes); i++ {
		out.WriteString(getNodeText(h.nodes[i]))
	}

	return out.String()
}
// getNodeText returns the text held by n and the nodes beneath it. A text node
// yields its data verbatim. Any other node yields the text of its children,
// gathered recursively in sibling order and concatenated, with leading and
// trailing whitespace trimmed from the combined result. A non-text node without
// children yields the empty string.
func getNodeText(n *html.Node) string {
	switch {
	case n.Type == html.TextNode:
		return n.Data
	case n.FirstChild == nil:
		return ""
	}

	var text bytes.Buffer
	for child := n.FirstChild; child != nil; child = child.NextSibling {
		text.WriteString(getNodeText(child))
	}

	return strings.TrimSpace(text.String())
}
// init registers an HTMLParser with the parser package under the name "html",
// making the HTML language parser available for later use.
func init() {
	parser.Register("html", new(HTMLParser))
}