author    Adam Mathes <adam@trenchant.org>    2018-02-20 21:47:26 -0800
committer Adam Mathes <adam@trenchant.org>    2018-02-20 21:47:26 -0800
commit    29475b918a1f227429d297f50ebb982363b8fd65 (patch)
tree      a4648d44099f1d283fb59f24fbcd8bde22e2fedb /crawler
parent    228a0f3cc10e59b4d866662da8e9d53171c1e009 (diff)
custom user-agent for crawler
Diffstat (limited to 'crawler')
-rw-r--r--  crawler/crawler.go | 49
1 file changed, 48 insertions(+), 1 deletion(-)
diff --git a/crawler/crawler.go b/crawler/crawler.go
index b66dd46..9c99077 100644
--- a/crawler/crawler.go
+++ b/crawler/crawler.go
@@ -4,7 +4,9 @@ import (
    "adammathes.com/neko/models/feed"
    "adammathes.com/neko/models/item"
    "adammathes.com/neko/vlog"
+   "fmt"
    "github.com/mmcdole/gofeed"
+   "io/ioutil"
    "log"
    "net/http"
    "time"
@@ -29,6 +31,49 @@ func Crawl() {
}
/*
+Simple HTTP GET function with a custom User-Agent header
+*/
+func GetFeedContent(feedURL string) string {
+
+    c := &http.Client{
+        // give up after 5 seconds
+        Timeout: 5 * time.Second,
+    }
+
+    request, err := http.NewRequest("GET", feedURL, nil)
+    if err != nil {
+        // a malformed feed URL should not kill the whole crawl
+        return ""
+    }
+
+    userAgent := "neko RSS Crawler +https://github.com/adammathes/neko"
+    request.Header.Set("User-Agent", userAgent)
+    resp, err := c.Do(request)
+
+    if err != nil {
+        return ""
+    }
+
+    // resp is always non-nil when err is nil, so it is safe to close
+    // the body here; log a close error rather than silently dropping it
+    defer func() {
+        if ce := resp.Body.Close(); ce != nil {
+            log.Println(ce)
+        }
+    }()
+
+    if resp.StatusCode < 200 || resp.StatusCode >= 300 {
+        return ""
+    }
+
+    bodyBytes, err := ioutil.ReadAll(resp.Body)
+    if err != nil {
+        return ""
+    }
+    return string(bodyBytes)
+}
+
+/*
TODO: sanitize input on crawl
*/
func CrawlFeed(f *feed.Feed, ch chan<- string) {
@@ -40,7 +85,9 @@ func CrawlFeed(f *feed.Feed, ch chan<- string) {
    fp := gofeed.NewParser()
    fp.Client = c
-   feed, err := fp.ParseURL(f.Url)
+   content := GetFeedContent(f.Url)
+   fmt.Printf("%v", content)
+   feed, err := fp.ParseString(content)
    if err != nil {
        vlog.Println(err)
        ch <- "failed to fetch and parse for " + f.Url + "\n"
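
A quick way to exercise the new code path is a test against a local httptest server. The sketch below is not part of this commit; the test name and placement are hypothetical, and it assumes GetFeedContent is exported from the crawler package exactly as in the diff above.

package crawler

import (
    "net/http"
    "net/http/httptest"
    "strings"
    "testing"
)

// Hypothetical test: verify that the crawler sends its custom
// User-Agent header and returns the response body verbatim.
func TestGetFeedContentUserAgent(t *testing.T) {
    var gotUA string
    ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
        gotUA = r.Header.Get("User-Agent")
        w.Write([]byte("<rss></rss>"))
    }))
    defer ts.Close()

    if body := GetFeedContent(ts.URL); body != "<rss></rss>" {
        t.Errorf("unexpected body: %q", body)
    }
    if !strings.Contains(gotUA, "neko RSS Crawler") {
        t.Errorf("unexpected User-Agent: %q", gotUA)
    }
}

Since GetFeedContent collapses every failure into an empty string, a server like this is also the simplest way to check the timeout and the non-2xx handling.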