From 29475b918a1f227429d297f50ebb982363b8fd65 Mon Sep 17 00:00:00 2001
From: Adam Mathes
Date: Tue, 20 Feb 2018 21:47:26 -0800
Subject: custom user-agent for crawler

---
 crawler/crawler.go | 49 ++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 48 insertions(+), 1 deletion(-)

(limited to 'crawler')

diff --git a/crawler/crawler.go b/crawler/crawler.go
index b66dd46..9c99077 100644
--- a/crawler/crawler.go
+++ b/crawler/crawler.go
@@ -4,7 +4,9 @@ import (
 	"adammathes.com/neko/models/feed"
 	"adammathes.com/neko/models/item"
 	"adammathes.com/neko/vlog"
+	"fmt"
 	"github.com/mmcdole/gofeed"
+	"io/ioutil"
 	"log"
 	"net/http"
 	"time"
@@ -28,6 +30,49 @@ func Crawl() {
 	}
 }
 
+/*
+Simple HTTP Get fnx with custom user agent header
+*/
+func GetFeedContent(feedURL string) string {
+
+	c := &http.Client{
+		// give up after 5 seconds
+		Timeout: 5 * time.Second,
+	}
+
+	request, err := http.NewRequest("GET", feedURL, nil)
+	if err != nil {
+		log.Fatalln(err)
+	}
+
+	userAgent := "neko RSS Crawler +https://github.com/adammathes/neko"
+	request.Header.Set("User-Agent", userAgent)
+	resp, err := c.Do(request)
+
+	if err != nil {
+		return ""
+	}
+
+	if resp != nil {
+		defer func() {
+			ce := resp.Body.Close()
+			if ce != nil {
+				err = ce
+			}
+		}()
+	}
+
+	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
+		return ""
+	}
+
+	bodyBytes, err := ioutil.ReadAll(resp.Body)
+	if err != nil {
+		return ""
+	}
+	return string(bodyBytes)
+}
+
 /*
 TODO: sanitize input on crawl
 */
@@ -40,7 +85,9 @@ func CrawlFeed(f *feed.Feed, ch chan<- string) {
 	fp := gofeed.NewParser()
 	fp.Client = c
 
-	feed, err := fp.ParseURL(f.Url)
+	content := GetFeedContent(f.Url)
+	fmt.Printf("%v", content)
+	feed, err := fp.ParseString(content)
 	if err != nil {
 		vlog.Println(err)
 		ch <- "failed to fetch and parse for " + f.Url + "\n"
--
cgit v1.2.3
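
For context, here is a minimal standalone sketch (not part of the commit above) of the same fetch-with-custom-User-Agent pattern that GetFeedContent introduces: a plain GET with a short timeout, an explicit User-Agent header, a status-code check, and a body read. The fetch helper and the local httptest server are illustrative assumptions for demonstration, not code from the neko repository.

package main

import (
	"fmt"
	"io"
	"log"
	"net/http"
	"net/http/httptest"
	"time"
)

// fetch mirrors the pattern used by GetFeedContent in the patch above:
// a GET request with a 5 second timeout and a custom User-Agent header.
func fetch(url, userAgent string) (string, error) {
	c := &http.Client{
		// give up after 5 seconds
		Timeout: 5 * time.Second,
	}

	req, err := http.NewRequest("GET", url, nil)
	if err != nil {
		return "", err
	}
	req.Header.Set("User-Agent", userAgent)

	resp, err := c.Do(req)
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()

	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
		return "", fmt.Errorf("unexpected status: %s", resp.Status)
	}

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return "", err
	}
	return string(body), nil
}

func main() {
	// Local test server that echoes back the User-Agent it received,
	// so the custom header can be verified without hitting a real feed.
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		fmt.Fprintf(w, "server saw User-Agent: %q", r.Header.Get("User-Agent"))
	}))
	defer srv.Close()

	body, err := fetch(srv.URL, "neko RSS Crawler +https://github.com/adammathes/neko")
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println(body)
}

One design difference to note: this sketch returns an error instead of an empty string so the caller can tell failure modes apart, whereas the commit keeps the empty-string convention and lets the subsequent ParseString call fail on empty input.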