Diffstat (limited to 'internal/crawler/crawler.go')
-rw-r--r--  internal/crawler/crawler.go  161
1 file changed, 161 insertions(+), 0 deletions(-)
diff --git a/internal/crawler/crawler.go b/internal/crawler/crawler.go
new file mode 100644
index 0000000..10253d8
--- /dev/null
+++ b/internal/crawler/crawler.go
@@ -0,0 +1,161 @@
+package crawler
+
+import (
+ "io/ioutil"
+ "log"
+ "net/http"
+ "time"
+
+ "adammathes.com/neko/internal/vlog"
+ "adammathes.com/neko/models/feed"
+ "adammathes.com/neko/models/item"
+ "github.com/mmcdole/gofeed"
+)
+
+const MAX_CRAWLERS = 5
+
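+// Crawl loads every feed and fans the work out to MAX_CRAWLERS concurrent
+// workers over a buffered channel, then waits for one result per feed.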
+func Crawl() {
+ crawlJobs := make(chan *feed.Feed, 100)
+ results := make(chan string, 100)
+
+ feeds, err := feed.All()
+ if err != nil {
+ log.Fatal(err)
+ }
+
+ for i := 0; i < MAX_CRAWLERS; i++ {
+ vlog.Printf("spawning crawl worker %d\n", i)
+ go CrawlWorker(crawlJobs, results)
+ }
+
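+ // queue every feed for the workers, then close the channel so each
+ // worker's range loop ends once the queue is drained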
+ for _, f := range feeds {
+ vlog.Printf("sending crawl job %s\n", f.Url)
+ crawlJobs <- f
+ }
+ close(crawlJobs)
+
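+ // collect one status message per feed so Crawl only returns after
+ // every feed has been crawled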
+ for i := 0; i < len(feeds); i++ {
+ vlog.Println(<-results)
+ }
+ close(results)
+}
+
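+// CrawlWorker consumes feeds from the jobs channel until it is closed,
+// crawling each one and reporting a status string on the results channel.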
+func CrawlWorker(feeds <-chan *feed.Feed, results chan<- string) {
+
+ for f := range feeds {
+ vlog.Printf("crawl job received %s\n", f.Url)
+ CrawlFeed(f, results)
+ vlog.Printf("crawl job finished %s\n", f.Url)
+ }
+}
+
+/*
+GetFeedContent performs a simple HTTP GET with a custom User-Agent header and returns the response body, or "" on any error.
+*/
+func GetFeedContent(feedURL string) string {
+
+ // introduce delays for testing
+ // n := time.Duration(rand.Int63n(3))
+ // time.Sleep(n * time.Second)
+
+ c := &http.Client{
+ // give up after 5 seconds
+ Timeout: 5 * time.Second,
+ }
+
+ request, err := http.NewRequest("GET", feedURL, nil)
+ if err != nil {
+ // a malformed feed URL should not abort the whole crawl
+ vlog.Println(err)
+ return ""
+ }
+
+ userAgent := "neko RSS Crawler +https://github.com/adammathes/neko"
+ request.Header.Set("User-Agent", userAgent)
+ resp, err := c.Do(request)
+
+ if err != nil {
+ return ""
+ }
+
+ if resp != nil {
+ defer func() {
+ if ce := resp.Body.Close(); ce != nil {
+ vlog.Println(ce)
+ }
+ }()
+ }
+
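+ // treat any non-2xx response as a failed fetch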
+ if resp.StatusCode < 200 || resp.StatusCode >= 300 {
+ return ""
+ }
+
+ bodyBytes, err := ioutil.ReadAll(resp.Body)
+ if err != nil {
+ return ""
+ }
+ return string(bodyBytes)
+}
+
+/*
+CrawlFeed fetches and parses f, updates the feed's title and link, and stores each entry as an item, reporting a status string on ch. TODO: sanitize input on crawl
+*/
+func CrawlFeed(f *feed.Feed, ch chan<- string) {
+ c := &http.Client{
+ // give up after 5 seconds
+ Timeout: 5 * time.Second,
+ }
+
+ fp := gofeed.NewParser()
+ fp.Client = c
+
+ content := GetFeedContent(f.Url)
+ feed, err := fp.ParseString(content)
+ if err != nil {
+ vlog.Println(err)
+ ch <- "failed parse for " + f.Url + "\n"
+ return
+ }
+
+ f.Title = feed.Title
+ f.WebUrl = feed.Link
+ f.Update()
+
+ for _, i := range feed.Items {
+ vlog.Printf("storing item: %s\n", i.Link)
+ var item item.Item
+ item.Title = i.Title
+ item.Url = i.Link
+
+ item.Description = i.Description
+ if len(i.Content) > len(item.Description) {
+ item.Description = i.Content
+ }
+
+ // a lot of the RSS 2.0 generated by WordPress and others
+ // puts the full content in <content:encoded>
+ e, ok := i.Extensions["content"]["encoded"]
+ var encoded = ""
+ if ok {
+ encoded = e[0].Value
+ }
+ if len(encoded) > len(item.Description) {
+ item.Description = encoded
+ }
+
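+ // fall back to the crawl time when the entry has no parsed publish date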
+ if i.PublishedParsed != nil {
+ item.PublishDate = i.PublishedParsed.Format("2006-01-02 15:04:05")
+ } else {
+ item.PublishDate = time.Now().Format("2006-01-02 15:04:05")
+ }
+
+ item.FeedId = f.Id
+ err := item.Create()
+ if err != nil {
+ vlog.Println(err)
+ }
+ // else {
+ // item.GetFullContent()
+ //}
+ }
+ ch <- "successfully crawled " + f.Url + "\n"
+}