Build a simple web scraper and a GraphQL API with Golang


I’m a sneakerhead.

You can find about 30 pairs of sneakers in my collection.

Monitoring sneaker releases and prices is super important for me and for the whole community.

In this article I’ll show how to implement a quick web parser and a GraphQL API in Golang.

Let’s install two packages first:

github.com/PuerkitoBio/goquery and github.com/graphql-go/graphql

Cool, let’s use Solecollector as our data source engine.

https://solecollector.com/sneaker-release-dates/all-release-dates

Our final API implementation:

monit_web

Parser

As a first step, let’s describe a simple struct responsible for storing sneaker data.

// Sneaker is one scraped sneaker release.
//
// The parser fills it with an unkeyed literal, so the order of the fields
// below is significant and must stay as it is.
type Sneaker struct {
	// ID is the numeric identifier the parser assigns to the entry.
	ID int

	// Title, Price, Date and Image are stored as plain text: the release
	// title, the price label, the release date string and the image URL.
	Title, Price, Date, Image string

	// Provider names the site the entry was scraped from.
	Provider string
}

// SneakerList is the package-level collection of parsed sneakers. The parser
// appends to it and the GraphQL resolvers read from it.
var SneakerList []Sneaker

Nothing special, just a simple Go struct with typed fields and a SneakerList slice for storing multiple sneakers.

Cool. Now let’s parse our webpage with the goquery package and get our data. Super simple, right?

It looks like Nokogiri for Ruby, if you are familiar with it.

// parseUrl downloads the release-calendar page at url, extracts every sneaker
// listed on it and appends the results to the package-level SneakerList.
//
// Each ".release-group__container" element is treated as one release date and
// the ".sneaker-release-item" elements inside it as the sneakers released on
// that date. The returned slice is SneakerList itself, so it also contains
// whatever earlier calls have already appended.
func parseUrl(url string) []Sneaker {
	fmt.Println("request: " + url)
	doc, err := goquery.NewDocument(url)
	// NOTE(review): _check is defined outside this snippet; presumably it
	// aborts on a non-nil error — confirm, since doc is used unconditionally.
	_check(err)

	doc.Find(".release-group__container").Each(func(_ int, group *goquery.Selection) {
		day := group.Find(".clg-releases__date__day").Text()
		month := group.Find(".clg-releases__date__month").Text()
		// NOTE(review): the year is hard-coded; releases from any other year
		// will be labelled 2019.
		date := day + "/" + month + "/2019"

		group.Find(".sneaker-release-item").Each(func(_ int, block *goquery.Selection) {
			// Number sneakers by their position in the whole list. The Each
			// index restarts at zero for every release group, so using it
			// would hand out the same IDs again for each date and make a
			// lookup by ID ambiguous.
			id := len(SneakerList) + 1
			title := block.Find(".sneaker-release__title").Text()
			price := strings.TrimSpace(block.Find(".sneaker-release__option--price").Text())
			// A missing src attribute simply leaves the image URL empty.
			image, _ := block.Find(".sneaker-release__img-16x9 a img").Attr("src")

			SneakerList = append(SneakerList, Sneaker{id, title, price, date, image, "SOLECOLLECTOR"})
		})
	})

	return SneakerList
}

At the end we call append to add each Sneaker struct to SneakerList.

GraphQL

In the sneakerType variable we initialize a new GraphQL object named Sneaker and declare the type of each of its fields.

// sneakerType is the GraphQL object type named "Sneaker". It exposes an
// integer "id" and string-valued "title", "price", "date", "image" and
// "provider" fields.
var sneakerType = graphql.NewObject(graphql.ObjectConfig{
	Name: "Sneaker",
	Fields: graphql.Fields{
		"id":       {Type: graphql.Int},
		"title":    {Type: graphql.String},
		"price":    {Type: graphql.String},
		"date":     {Type: graphql.String},
		"image":    {Type: graphql.String},
		"provider": {Type: graphql.String},
	},
})

In the next step we write our rootQuery, which is responsible for all queries to our API. I’ve added curl examples to show how to test it. As you can see, there are two request types: one for a single Sneaker and one for the Sneaker list.

// rootQuery is the root GraphQL query object. It exposes two fields:
// "sneaker", which looks up one entry of SneakerList by ID, and
// "sneakerList", which returns the whole list.
var rootQuery = graphql.NewObject(graphql.ObjectConfig{
	Name: "RootQuery",
	Fields: graphql.Fields{

		/*
		   curl -g 'http://localhost:8080/graphql?query={sneaker(id:"1"){id,title}}'
		*/
		"sneaker": &graphql.Field{
			Type:        sneakerType,
			Description: "Get single sneaker",
			Args: graphql.FieldConfigArgument{
				"id": &graphql.ArgumentConfig{
					Type: graphql.String,
				},
			},
			Resolve: func(params graphql.ResolveParams) (interface{}, error) {
				// The argument is declared as a GraphQL String, so it is
				// asserted as a Go string. Asserting an int here can never
				// succeed and would make every lookup come back empty.
				idQuery, isOK := params.Args["id"].(string)
				if !isOK {
					return Sneaker{}, nil
				}

				// Compare against the decimal form of each numeric ID.
				for _, sneaker := range SneakerList {
					if fmt.Sprint(sneaker.ID) == idQuery {
						return sneaker, nil
					}
				}

				// No match: answer with an empty Sneaker rather than an error.
				return Sneaker{}, nil
			},
		},

		/*
		   curl -g 'http://localhost:8080/graphql?query={sneakerList{id,title,price}}'
		*/
		"sneakerList": &graphql.Field{
			Type:        graphql.NewList(sneakerType),
			Description: "List of sneakers",
			Resolve: func(p graphql.ResolveParams) (interface{}, error) {
				return SneakerList, nil
			},
		},
	},
})

Let’s register our rootQuery in the GraphQL schema config and implement an executeQuery function for basic error reporting and for returning the results.

var schema, _ = graphql.NewSchema(graphql.SchemaConfig{
	Query: rootQuery
})

// executeQuery runs the GraphQL request string query against schema and
// returns the result. Any errors stay in the returned result and are also
// printed to standard output for debugging.
func executeQuery(query string, schema graphql.Schema) *graphql.Result {
	result := graphql.Do(graphql.Params{
		Schema:        schema,
		RequestString: query,
	})
	if len(result.Errors) > 0 {
		// End the line so that messages from consecutive failing requests
		// do not run together in the output.
		fmt.Printf("wrong result, unexpected errors: %v\n", result.Errors)
	}
	return result
}

As the final step we add some debug messages and define our API endpoint.

Enjoy your API and check the source at https://github.com/maratgaliev/sneakerstep.

// main registers the /graphql endpoint and a static file server, prints usage
// hints and then serves HTTP on port 8080 until the server stops.
//
// NOTE(review): nothing in this function fills SneakerList; presumably the
// parser is invoked elsewhere before requests arrive — confirm.
func main() {
	http.HandleFunc("/graphql", func(w http.ResponseWriter, r *http.Request) {
		// The GraphQL document is taken from the "query" URL parameter.
		result := executeQuery(r.URL.Query().Get("query"), schema)
		w.Header().Set("Content-Type", "application/json")
		if err := json.NewEncoder(w).Encode(result); err != nil {
			// The response may already be partly written, so the failure can
			// only be reported on the server side.
			fmt.Println("encoding GraphQL response:", err)
		}
	})
	// Serve static files
	fs := http.FileServer(http.Dir("static"))
	http.Handle("/", fs)
	// Display some basic instructions. The example queries use fields that
	// exist on the Sneaker type.
	fmt.Println("Now server is running on port 8080")
	fmt.Println("Get single sneaker: curl -g 'http://localhost:8080/graphql?query={sneaker(id:\"1\"){id,title,price}}'")
	fmt.Println("Load sneaker list: curl -g 'http://localhost:8080/graphql?query={sneakerList{id,title,price}}'")
	fmt.Println("Access the web app via browser at 'http://localhost:8080'")

	// ListenAndServe only returns when the server could not start or has
	// stopped; report the reason (for example, the port already being in use)
	// instead of exiting silently.
	if err := http.ListenAndServe(":8080", nil); err != nil {
		fmt.Println("server stopped:", err)
	}
}