How can I get the HTML title in c.OnResponse, or is there a better way to fill the struct with url/title/content?
// WebPage is the record produced for one crawled page; it is serialised to
// JSON with the lower-case keys "url", "title" and "content".
type WebPage struct {
	// Url is the address of the page.
	Url string `json:"url"`
	// Title is the title of the page.
	Title string `json:"title"`
	// Content is the page content (raw body or extracted body text,
	// depending on which callback fills it).
	Content string `json:"content"`
}
// Build a WebPage from the response and print it as indented JSON.
c.OnResponse(func(r *colly.Response) {
	pageCount++
	log.Println(r.Headers)
	webpage := WebPage{
		// The URL is read back from the request context; it must have been
		// stored under "url" by a c.OnRequest callback.
		Url: r.Ctx.Get("url"),
		// Placeholder: the parsed <title> is not known here because OnHTML
		// callbacks run after OnResponse. An OnHTML callback can store it
		// with e.Request.Ctx.Put("title", e.Text), and an OnScraped callback
		// can then read it with r.Ctx.Get("title").
		Title:   "my title",
		Content: string(r.Body),
	}
	enc := json.NewEncoder(os.Stdout)
	enc.SetIndent("", " ")
	// SEND it to elasticsearch
	if err := enc.Encode(webpage); err != nil {
		log.Println("encoding page", webpage.Url, "failed:", err)
	}
	// Log the URL of this very response rather than a variable shared with
	// other callbacks, which may already refer to a different page.
	log.Printf("%d DONE Visiting : %s", pageCount, webpage.Url)
})
I can get the title in a callback like the one below; however, Ctx is not available there, so I can't put the "title" value in Ctx. Are there other options?
// Print the page title and store it in the per-request context so that
// callbacks running later for the same page can read it back with
// r.Ctx.Get("title").
c.OnHTML("title", func(e *colly.HTMLElement) {
	fmt.Println(e.Text)
	// HTMLElement has no Ctx field of its own; the context shared by the
	// request and its response is reached through e.Request (or e.Response).
	e.Request.Ctx.Put("title", e.Text)
})
Logs
2020/05/07 17:42:37 7 DONE Visiting : https://www.coursera.org/learn/build-portfolio-website-html-css
{
"url": "https://www.coursera.org/learn/build-portfolio-website-html-css",
"title": "my page title",
"content": "page html body bla "
}
2020/05/07 17:42:37 8 DONE Visiting : https://www.coursera.org/learn/build-portfolio-website-html-css
{
"url": "https://www.coursera.org/browse/social-sciences",
"title": "my page title",
"content": "page html body bla "
}
I created a global variable of that struct and keep filling it in from the different callbacks.
I am not sure if this is the best way.
fun main(){
....
webpage := WebPage{} //Is this a right way to declare a mutable struct?
c.OnRequest(func(r *colly.Request) { // url
webpage.Url = r.URL.String() // Is this the right way to mutate?
})
c.OnResponse(func(r *colly.Response) { //get body
pageCount++
log.Println(fmt.Sprintf("%d DONE Visiting : %s", pageCount, webpage.Url))
})
c.OnHTML("head title", func(e *colly.HTMLElement) { // Title
webpage.Title = e.Text
})
c.OnHTML("html body", func(e *colly.HTMLElement) { // Body / content
webpage.Content = e.Text // Can url title body be misrepresented in multithread scenario?
})
c.OnHTML("a[href]", func(e *colly.HTMLElement) { // href , callback
link := e.Attr("href")
e.Request.Visit(link)
})
c.OnError(func(r *colly.Response, err error) { // Set error handler
log.Println("Request URL:", r.Request.URL, "failed with response:", r, "\nError:", err)
})
c.OnScraped(func(r *colly.Response) { // DONE
enc := json.NewEncoder(os.Stdout)
enc.SetIndent("", " ")
enc.Encode(webpage)
})