From e3c379d069ffa9661561d25cdbf2f5894a2f8ee8 Mon Sep 17 00:00:00 2001
From: Adam Mathes
Date: Sat, 14 Feb 2026 08:58:38 -0800
Subject: Refactor: project structure, implement dependency injection, and align v2 UI with v1

---
 crawler/crawler.go | 160 ---------------------------------------------
 1 file changed, 160 deletions(-)
 delete mode 100644 crawler/crawler.go

diff --git a/crawler/crawler.go b/crawler/crawler.go
deleted file mode 100644
index f8794d4..0000000
--- a/crawler/crawler.go
+++ /dev/null
@@ -1,160 +0,0 @@
-package crawler
-
-import (
-	"adammathes.com/neko/models/feed"
-	"adammathes.com/neko/models/item"
-	"adammathes.com/neko/vlog"
-	"github.com/mmcdole/gofeed"
-	"io/ioutil"
-	"log"
-	"net/http"
-	"time"
-)
-
-const MAX_CRAWLERS = 5
-
-func Crawl() {
-	crawlJobs := make(chan *feed.Feed, 100)
-	results := make(chan string, 100)
-
-	feeds, err := feed.All()
-	if err != nil {
-		log.Fatal(err)
-	}
-
-	for i := 0; i < MAX_CRAWLERS; i++ {
-		vlog.Printf("spawning crawl worker %d\n", i)
-		go CrawlWorker(crawlJobs, results)
-	}
-
-	for _, f := range feeds {
-		vlog.Printf("sending crawl job %s\n", f.Url)
-		crawlJobs <- f
-	}
-	close(crawlJobs)
-
-	for i := 0; i < len(feeds); i++ {
-		vlog.Println(<-results)
-	}
-	close(results)
-}
-
-func CrawlWorker(feeds <-chan *feed.Feed, results chan<- string) {
-
-	for f := range feeds {
-		vlog.Printf("crawl job received %s\n", f.Url)
-		CrawlFeed(f, results)
-		vlog.Printf("crawl job finished %s\n", f.Url)
-	}
-}
-
-/*
-Simple HTTP Get fnx with custom user agent header
-*/
-func GetFeedContent(feedURL string) string {
-
-	// introduce delays for testing
-	// n := time.Duration(rand.Int63n(3))
-	// time.Sleep(n * time.Second)
-
-	c := &http.Client{
-		// give up after 5 seconds
-		Timeout: 5 * time.Second,
-	}
-
-	request, err := http.NewRequest("GET", feedURL, nil)
-	if err != nil {
-		log.Fatalln(err)
-	}
-
-	userAgent := "neko RSS Crawler +https://github.com/adammathes/neko"
-	request.Header.Set("User-Agent", userAgent)
-	resp, err := c.Do(request)
-
-	if err != nil {
-		return ""
-	}
-
-	if resp != nil {
-		defer func() {
-			ce := resp.Body.Close()
-			if ce != nil {
-				err = ce
-			}
-		}()
-	}
-
-	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
-		return ""
-	}
-
-	bodyBytes, err := ioutil.ReadAll(resp.Body)
-	if err != nil {
-		return ""
-	}
-	return string(bodyBytes)
-}
-
-/*
-  TODO: sanitize input on crawl
-*/
-func CrawlFeed(f *feed.Feed, ch chan<- string) {
-	c := &http.Client{
-		// give up after 5 seconds
-		Timeout: 5 * time.Second,
-	}
-
-	fp := gofeed.NewParser()
-	fp.Client = c
-
-	content := GetFeedContent(f.Url)
-	feed, err := fp.ParseString(content)
-	if err != nil {
-		vlog.Println(err)
-		ch <- "failed parse for " + f.Url + "\n"
-		return
-	}
-
-	f.Title = feed.Title
-	f.WebUrl = feed.Link
-	f.Update()
-
-	for _, i := range feed.Items {
-		vlog.Printf("storing item: %s\n", i.Link)
-		var item item.Item
-		item.Title = i.Title
-		item.Url = i.Link
-
-		item.Description = i.Description
-		if len(i.Content) > len(item.Description) {
-			item.Description = i.Content
-		}
-
-		// a lot of RSS2.0 generated by wordpress and others
-		// uses <content:encoded>
-		e, ok := i.Extensions["content"]["encoded"]
-		var encoded = ""
-		if ok {
-			encoded = e[0].Value
-		}
-		if len(encoded) > len(item.Description) {
-			item.Description = encoded
-		}
-
-		if i.PublishedParsed != nil {
-			item.PublishDate = i.PublishedParsed.Format("2006-01-02 15:04:05")
-		} else {
-			item.PublishDate = time.Now().Format("2006-01-02 15:04:05")
-		}
-
-		item.FeedId = f.Id
-		err := item.Create()
-		if err != nil {
-			vlog.Println(err)
-		}
-		// else {
-		//	item.GetFullContent()
-		//}
-	}
-	ch <- "successfully crawled " + f.Url + "\n"
-}
--
cgit v1.2.3
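
Note on the subject line: it says this refactor "implement[s] dependency injection", and the deleted crawler shows why that was on the list. It depends on package-level state throughout: a free Crawl() function, the MAX_CRAWLERS constant, and direct calls to feed.All() and item.Create(). As a point of comparison, here is a minimal Go sketch of the same worker-pool fan-out with its collaborators injected instead. The Crawler struct, FeedStore interface, and New constructor are illustrative assumptions for this sketch, not the types the commit actually introduces elsewhere in the tree.

package crawler

import (
	"net/http"
	"time"
)

// Feed is a stand-in for the feed model; only the field the sketch
// needs is shown.
type Feed struct {
	Url string
}

// FeedStore abstracts persistence so Crawl no longer reaches for a
// package-level feed.All().
type FeedStore interface {
	All() ([]*Feed, error)
}

// Crawler carries its dependencies as fields instead of globals,
// which makes the worker pool testable with fakes.
type Crawler struct {
	Store   FeedStore
	Client  *http.Client
	Workers int
}

func New(store FeedStore) *Crawler {
	return &Crawler{
		Store:   store,
		Client:  &http.Client{Timeout: 5 * time.Second},
		Workers: 5,
	}
}

// Crawl reproduces the deleted fan-out/fan-in: spawn a fixed pool of
// workers, feed them jobs over a channel, then drain one result per feed.
func (c *Crawler) Crawl() error {
	feeds, err := c.Store.All()
	if err != nil {
		return err
	}

	jobs := make(chan *Feed, len(feeds))
	results := make(chan string, len(feeds))

	for i := 0; i < c.Workers; i++ {
		go c.worker(jobs, results)
	}
	for _, f := range feeds {
		jobs <- f
	}
	close(jobs)

	for range feeds {
		<-results
	}
	return nil
}

func (c *Crawler) worker(jobs <-chan *Feed, results chan<- string) {
	for f := range jobs {
		// fetch and parse f.Url with c.Client here, as the deleted
		// CrawlFeed did; report completion either way
		results <- "crawled " + f.Url
	}
}

With the store behind an interface, the pool can be exercised in tests against an in-memory FeedStore rather than a live database, which is the usual payoff of this style of refactor.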