scrapegoat

Scraper library written in Go.

Usage

package main

import (
	"fmt"

	sg "github.com/hosiawak/scrapegoat"
	"github.com/PuerkitoBio/goquery"
)

// Product is the struct that holds your parsed item.
// Define one field for every piece of data you want to keep.
type Product struct {
	website     string
	name        string
	description string
}

// NewProduct is the initialization function for your item.
// It returns a fresh *Product whose website is preset to "amazon.com".
func NewProduct() sg.Item {
	item := new(Product)
	item.website = "amazon.com"
	return item
}

// Parse is the parsing function for your item.
// resp.Body is an io.Reader holding the page; here it is handed to goquery,
// but any parser that accepts a reader can be used instead.
//
// The page <title> text is stored in name and the text of the
// ".productDescriptionWrapper" element in description. On success the
// receiver itself is returned; if goquery cannot build the document,
// nil and that error are returned. ctx is not used in this example.
func (p *Product) Parse(resp *sg.Response, ctx sg.Context) (sg.Item, error) {
	page, parseErr := goquery.NewDocumentFromReader(resp.Body)
	if parseErr != nil {
		return nil, parseErr
	}

	title := page.Find("title").Text()
	details := page.Find(".productDescriptionWrapper").Text()
	p.name, p.description = title, details

	return p, nil
}

func main() {
	// The spider delivers every *sg.Response on this channel.
	responses := make(chan *sg.Response)

	// Build the spider, handing it the channel it should publish to.
	crawler := sg.NewSpider("amazon.com", responses)

	// Tell the spider how to create a new item.
	crawler.NewItemFunc = NewProduct

	// Start the spider.
	crawler.Start()

	// Enqueue the URLs you want scraped.
	crawler.EnqueueURL("http://www.amazon.com/Apple-iPod-classic-Black-Generation/dp/B001F7AHOG")

	// Collect the result.
	// The receive blocks until a response arrives, so you may want to do it
	// in a goroutine.
	first := <-responses

	// The channel can carry any item type (scrapegoat.Item is interface{}),
	// so a type assertion is needed to get your own struct back.
	product, ok := first.Item.(*Product)
	if !ok {
		panic("Assertion failed")
	}
	fmt.Printf("Product Name: %s\nDescription: %s\n", product.name, product.description)

	// Stop the spider once you are done.
	crawler.Stop()
}

Name		Name	Last commit message	Last commit date
Latest commit History 16 Commits
.drone.yml		.drone.yml
.gitignore		.gitignore
LICENSE		LICENSE
README.md		README.md
context.go		context.go
item.go		item.go
logo.png		logo.png
request.go		request.go
response.go		response.go
response_test.go		response_test.go
spider.go		spider.go
spider_test.go		spider_test.go
todo.org		todo.org
util.go		util.go
worker.go		worker.go

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Repository files navigation

scrapegoat

Usage

About

Releases

Packages

Languages

License

hosiawak/scrapegoat

Folders and files

Latest commit

History

Repository files navigation

scrapegoat

Usage

About

Resources

License

Stars

Watchers

Forks

Releases

Packages 0

Languages

Packages