From 741cd94da338e00e2eee0e12d572e04c3be46530 Mon Sep 17 00:00:00 2001
From: Adam Mathes
Date: Sun, 19 Nov 2017 21:23:04 -0700
Subject: enable full_text and header_image

---
 crawler/crawler.go  | 17 +++++++++--------
 init.sql            |  1 +
 models/item/item.go | 55 +++++++++++++++++++++++++++++++++++++++--------------
 static/ui.html      |  7 ++++++-
 4 files changed, 57 insertions(+), 23 deletions(-)

diff --git a/crawler/crawler.go b/crawler/crawler.go
index ea9f694..e84e219 100644
--- a/crawler/crawler.go
+++ b/crawler/crawler.go
@@ -1,21 +1,20 @@
 package crawler
 
 import (
-	"log"
 	"adammathes.com/neko/models/feed"
 	"adammathes.com/neko/models/item"
+	"adammathes.com/neko/vlog"
+	"github.com/mmcdole/gofeed"
+	"log"
 	"net/http"
 	"time"
-	"github.com/mmcdole/gofeed"
-	"adammathes.com/neko/vlog"
 )
 
-
 func Crawl() {
 	ch := make(chan string)
-	feeds,err := feed.All()
+	feeds, err := feed.All()
 	if err != nil {
 		log.Fatal(err)
 	}
@@ -64,7 +63,7 @@ func CrawlFeed(f *feed.Feed, ch chan<- string) {
 
 		// a lot of RSS2.0 generated by wordpress and others
 		// uses <content:encoded>
-		e,ok := i.Extensions["content"]["encoded"]
+		e, ok := i.Extensions["content"]["encoded"]
 		var encoded = ""
 		if ok {
 			encoded = e[0].Value
@@ -73,16 +72,18 @@ func CrawlFeed(f *feed.Feed, ch chan<- string) {
 			item.Description = encoded
 		}
 
-		if(i.PublishedParsed != nil) {
+		if i.PublishedParsed != nil {
 			item.PublishDate = i.PublishedParsed.Format("2006-01-02 15:04:05")
 		} else {
 			item.PublishDate = time.Now().Format("2006-01-02 15:04:05")
 		}
-
+
 		item.FeedId = f.Id
 		err := item.Create()
 		if err != nil {
 			vlog.Println(err)
+		} else {
+			item.GetFullContent()
 		}
 	}
 	ch <- "successfully crawled " + f.Url + "\n"
diff --git a/init.sql b/init.sql
index d69d5e9..d2c0de1 100644
--- a/init.sql
+++ b/init.sql
@@ -17,6 +17,7 @@ CREATE TABLE item (
        title TEXT,
        url VARCHAR(255) NOT NULL,
        description TEXT,
+       full_content TEXT,
       publish_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP NOT NULL,
       read_state BOOLEAN DEFAULT FALSE NOT NULL,
       starred BOOLEAN DEFAULT FALSE NOT NULL,
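Note: this schema hunk adds only full_content, while GetFullContent() below also writes a header_image column, so a database built from this init.sql alone would fail that UPDATE. A one-off migration along the following lines would cover both columns on an existing install — a hypothetical sketch, not part of this patch; the MySQL driver, DSN, and VARCHAR width are all assumptions:

// migrate.go -- hypothetical helper, not part of this patch
package main

import (
	"database/sql"
	"log"

	_ "github.com/go-sql-driver/mysql"
)

func main() {
	// placeholder DSN; point this at whatever database neko uses
	db, err := sql.Open("mysql", "user:pass@/neko")
	if err != nil {
		log.Fatal(err)
	}
	defer db.Close()

	for _, stmt := range []string{
		"ALTER TABLE item ADD COLUMN full_content TEXT",
		// assumed type/width; init.sql above does not add this column
		"ALTER TABLE item ADD COLUMN header_image VARCHAR(255)",
	} {
		if _, err := db.Exec(stmt); err != nil {
			log.Println(err) // e.g. column already exists
		}
	}
}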
diff --git a/models/item/item.go b/models/item/item.go
index 5a4c274..48d76bf 100644
--- a/models/item/item.go
+++ b/models/item/item.go
@@ -1,26 +1,32 @@
 package item
 
 import (
-	"fmt"
-	"log"
 	"adammathes.com/neko/models"
+	"fmt"
+	"github.com/advancedlogic/GoOse"
 	"github.com/microcosm-cc/bluemonday"
+	"github.com/russross/blackfriday"
+	"log"
 )
 
 type Item struct {
-	Id int64 `json:"_id,string,omitempty"`
+	Id int64 `json:"_id,string,omitempty"`
+
+	Title string `json:"title"`
+	Url   string `json:"url"`
 
-	Title string `json:"title"`
-	Url string `json:"url"`
 	Description string `json:"description"`
 	PublishDate string `json:"publish_date"`
-	FeedId int64
-	FeedTitle string `json:"feed_title"`
-	FeedUrl string `json:"feed_url"`
+	FeedId    int64
+	FeedTitle string `json:"feed_title"`
+	FeedUrl   string `json:"feed_url"`
+
+	ReadState bool `json:"read"`
+	Starred   bool `json:"starred"`
 
-	ReadState bool `json:"read"`
-	Starred bool `json:"starred"`
+	FullContent string `json:"full_content"`
+	HeaderImage string `json:"header_image"`
 }
 
 func (i *Item) Print() {
@@ -61,12 +67,34 @@ func (i *Item) FullSave() {
 	}
 }
 
+func (i *Item) GetFullContent() {
+	g := goose.New()
+	article, err := g.ExtractFromURL(i.Url)
+	var md, img string
+	md = ""
+	img = ""
+	if err != nil {
+		log.Println(err)
+	} else {
+		md = string(blackfriday.MarkdownCommon([]byte(article.CleanedText)))
+		img = article.TopImage
+	}
+
+	_, err = models.DB.Exec(`UPDATE item
+                             SET full_content=?, header_image=?
+                             WHERE id=?`, md, img, i.Id)
+	if err != nil {
+		log.Println(err)
+	}
+}
+
 func Filter(max_id int64, feed_id int64, unread_only bool, starred_only bool) ([]*Item, error) {
 	var args []interface{}
 	query := `SELECT item.id, item.title, item.url, item.description,
                          item.read_state, item.starred, item.publish_date,
+                         item.full_content, item.header_image,
                          feed.url, feed.title
                   FROM item,feed
                   WHERE item.feed_id=feed.id `
@@ -89,7 +117,6 @@ func Filter(max_id int64, feed_id int64, unread_only bool, starred_only bool) ([
 		query = query + " AND item.starred=1 "
 	}
 
-
 	query = query + "ORDER BY item.id DESC LIMIT 15"
 	// log.Println(query)
 	// log.Println(args...)
@@ -106,16 +133,15 @@ func Filter(max_id int64, feed_id int64, unread_only bool, starred_only bool) ([
 	p.AllowAttrs("href").OnElements("a")
 	p.AllowAttrs("src", "alt").OnElements("img")
 
-
 	items := make([]*Item, 0)
 	for rows.Next() {
 		i := new(Item)
-		err := rows.Scan(&i.Id, &i.Title, &i.Url, &i.Description, &i.ReadState, &i.Starred, &i.PublishDate, &i.FeedUrl, &i.FeedTitle)
+		err := rows.Scan(&i.Id, &i.Title, &i.Url, &i.Description, &i.ReadState, &i.Starred, &i.PublishDate, &i.FullContent, &i.HeaderImage, &i.FeedUrl, &i.FeedTitle)
 		if err != nil {
 			log.Println(err)
 			return nil, err
 		}
-
+
 		// sanitize all fields from external input
 		// should do this at ingest time, probably, for efficiency
 		// but still may need to adjust rules
@@ -124,6 +150,7 @@ func Filter(max_id int64, feed_id int64, unread_only bool, starred_only bool) ([
 		i.Url = p.Sanitize(i.Url)
 		i.FeedTitle = p.Sanitize(i.FeedTitle)
 		i.FeedUrl = p.Sanitize(i.FeedUrl)
+		i.FullContent = p.Sanitize(i.FullContent)
 		items = append(items, i)
 	}
 	if err = rows.Err(); err != nil {
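The extraction pipeline in GetFullContent() can be read in isolation: GoOse fetches the page and heuristically pulls out readable text plus a top image, and blackfriday.MarkdownCommon() wraps that plain text in HTML paragraphs for storage in full_content. A minimal standalone sketch of the same calls — the URL is a placeholder, and error handling is reduced to log-and-exit:

package main

import (
	"fmt"
	"log"

	"github.com/advancedlogic/GoOse"
	"github.com/russross/blackfriday"
)

func main() {
	g := goose.New()
	// placeholder URL; GetFullContent passes item.Url here
	article, err := g.ExtractFromURL("https://example.com/some-post")
	if err != nil {
		log.Fatal(err)
	}
	// CleanedText is plain text; MarkdownCommon turns its paragraph
	// breaks into <p> tags, which ui.html renders via {{html ...}}
	html := blackfriday.MarkdownCommon([]byte(article.CleanedText))
	fmt.Println(article.TopImage) // candidate header image URL
	fmt.Println(string(html))
}

Note the call site in crawler.go above: GetFullContent() runs synchronously for each newly created item, so a slow article server delays the rest of that feed's crawl, and on extraction failure the UPDATE still runs, leaving full_content empty.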
diff --git a/static/ui.html b/static/ui.html
index 7b1d08c..b977dad 100644
--- a/static/ui.html
+++ b/static/ui.html
@@ -39,7 +39,12 @@
       ${ item.publish_date } from ${item.feed_title}
       </div>
       <div class="description">
       {{html item.description}}
-      </div>
+      </div>
+      <a href="#" class="full-content-link">full content</a>
+
+      <div class="full-content">
+      {{html item.full_content}}
+      </div>
 
     </div>
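Since the template injects full_content with {{html ...}} rather than escaping it, the bluemonday pass in Filter() is what stands between the stored markup and the reader's DOM. A small sketch of what that policy lets through — assuming a restrictive base policy, since the hunk above shows only the AllowAttrs calls, not how p is constructed:

package main

import (
	"fmt"

	"github.com/microcosm-cc/bluemonday"
)

func main() {
	// assumption: NewPolicy() as the base; the diff context does not
	// show the actual constructor used in Filter()
	p := bluemonday.NewPolicy()
	p.AllowAttrs("href").OnElements("a")
	p.AllowAttrs("src", "alt").OnElements("img")

	dirty := `<a href="http://example.com/" onclick="evil()">link</a><script>evil()</script>`
	// the onclick attribute and the script element are stripped,
	// leaving only the whitelisted tags and attributes
	fmt.Println(p.Sanitize(dirty))
}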