From e9f84c8a3d399789fc49ce0ee50ba257261b9e80 Mon Sep 17 00:00:00 2001 From: Adam Mathes Date: Mon, 30 Apr 2018 11:41:42 -0700 Subject: set limit to number of simultaneous crawl jobs --- crawler/crawler.go | 34 ++++++++++++++++++++++++++++------ models/item/item.go | 2 +- 2 files changed, 29 insertions(+), 7 deletions(-) diff --git a/crawler/crawler.go b/crawler/crawler.go index a29f814..e52e491 100644 --- a/crawler/crawler.go +++ b/crawler/crawler.go @@ -11,21 +11,39 @@ import ( "time" ) -func Crawl() { +const MAX_CRAWLERS = 5 - ch := make(chan string) +func Crawl() { + crawlJobs := make(chan *feed.Feed, 100) + results := make(chan string, 100) feeds, err := feed.All() if err != nil { log.Fatal(err) } + + for i := 0; i < MAX_CRAWLERS; i++ { + vlog.Printf("spawning crawl worker %d\n", i) + go CrawlWorker(crawlJobs, results) + } + for _, f := range feeds { - vlog.Printf("crawling %s\n", f.Url) - go CrawlFeed(f, ch) + vlog.Printf("sending crawl job %s\n", f.Url) + crawlJobs <- f } + close(crawlJobs) for i := 0; i < len(feeds); i++ { - vlog.Println(<-ch) + vlog.Println(<-results) + } +} + +func CrawlWorker(feeds <-chan *feed.Feed, results chan<- string) { + + for f := range feeds { + vlog.Printf("crawl job recieved %s\n", f.Url) + CrawlFeed(f, results) + vlog.Printf("crawl job finished %s\n", f.Url) } } @@ -34,6 +52,10 @@ Simple HTTP Get fnx with custom user agent header */ func GetFeedContent(feedURL string) string { + // introduce delays for testing + // n := time.Duration(rand.Int63n(3)) + // time.Sleep(n * time.Second) + c := &http.Client{ // give up after 5 seconds Timeout: 5 * time.Second, @@ -88,7 +110,7 @@ func CrawlFeed(f *feed.Feed, ch chan<- string) { feed, err := fp.ParseString(content) if err != nil { vlog.Println(err) - ch <- "failed to fetch and parse for " + f.Url + "\n" + ch <- "failed parse for " + f.Url + "\n" return } diff --git a/models/item/item.go b/models/item/item.go index 52d3d37..dd8b52c 100644 --- a/models/item/item.go +++ b/models/item/item.go @@ -45,7 +45,7 @@ func (i *Item) Create() error { item(title, url, description, publish_date, feed_id) VALUES(?, ?, ?, ?, ?)`, i.Title, i.Url, i.Description, i.PublishDate, i.FeedId) if err != nil { - vlog.Printf("Error on item.Create\n%v\n%v\n", i, err) + vlog.Printf("Error on item.Create\n%v\n%v\n", i.Url, err) return err } -- cgit v1.2.3