// Package crawler fetches the feeds returned by feed.All and creates an
// item record for every entry found in each of them.
package crawler

import (
	"log"
	"net/http"
	"time"

	// NOTE(review): the two neko model packages are imported through
	// different roots ("github.com/adammathes/neko/..." vs "neko/...");
	// confirm this is intentional.
	"github.com/adammathes/neko/models/feed"
	"github.com/mmcdole/gofeed"
	"neko/models/item"
)

// Crawl starts one CrawlFeed goroutine per feed returned by feed.All and
// then blocks until every goroutine has reported a status line, logging
// each line as it arrives.
//
// If feed.All returns an error the process is terminated via log.Fatal.
func Crawl() {
	ch := make(chan string)

	feeds, err := feed.All()
	if err != nil {
		log.Fatal(err)
	}

	for _, f := range feeds {
		log.Printf("crawling %s", f.Url)
		go CrawlFeed(f, ch)
	}

	// Every CrawlFeed call sends exactly one message, so receiving
	// len(feeds) messages waits for all of them to finish.
	for range feeds {
		log.Println(<-ch)
	}
}

/*
TODO: sanitize input on crawl
*/

// CrawlFeed downloads and parses the feed at f.Url, copies the parsed title
// onto f and calls f.Update, then creates one item per feed entry.
//
// Exactly one status message is sent on ch before the function returns:
// a failure message if the feed could not be fetched or parsed, otherwise
// a success message. Errors from creating individual items are logged and
// do not abort the crawl.
func CrawlFeed(f *feed.Feed, ch chan<- string) {
	client := &http.Client{
		// give up after 5 seconds
		Timeout: 5 * time.Second,
	}

	parser := gofeed.NewParser()
	parser.Client = client

	parsed, err := parser.ParseURL(f.Url)
	if err != nil {
		log.Print(err)
		ch <- "failed to fetch and parse for " + f.Url
		return
	}

	f.Title = parsed.Title
	f.Update()

	for _, entry := range parsed.Items {
		log.Printf("storing item: %s", entry.Title)

		var it item.Item
		it.Title = entry.Title
		it.Url = entry.Link

		// Keep whichever candidate body is longest: the description,
		// the content, or the content:encoded extension.
		it.Description = entry.Description
		if len(entry.Content) > len(it.Description) {
			it.Description = entry.Content
		}

		// a lot of RSS2.0 generated by wordpress and others
		// uses content:encoded
		encoded := ""
		if ext, ok := entry.Extensions["content"]["encoded"]; ok && len(ext) > 0 {
			encoded = ext[0].Value
		}
		if len(encoded) > len(it.Description) {
			it.Description = encoded
		}

		// PublishedParsed is a pointer and is nil when the entry has no
		// parseable publish date; dereferencing it would panic and bring
		// down every crawl in progress. Fall back to the updated date,
		// and finally to the time of this crawl.
		published := time.Now()
		switch {
		case entry.PublishedParsed != nil:
			published = *entry.PublishedParsed
		case entry.UpdatedParsed != nil:
			published = *entry.UpdatedParsed
		}
		it.PublishDate = published.Format(time.RFC3339)

		it.FeedId = f.Id

		if err := it.Create(); err != nil {
			log.Println(err)
		}
	}

	ch <- "successfully crawled " + f.Url
}