about summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- crawler/crawler.go 17
-rw-r--r-- init.sql 1
-rw-r--r-- models/item/item.go 55
-rw-r--r-- static/ui.html 7
4 files changed, 57 insertions, 23 deletions
diff --git a/crawler/crawler.go b/crawler/crawler.go
index ea9f694..e84e219 100644
--- a/crawler/crawler.go
+++ b/crawler/crawler.go
@@ -1,21 +1,20 @@
package crawler
import (
- "log"
"adammathes.com/neko/models/feed"
"adammathes.com/neko/models/item"
+ "adammathes.com/neko/vlog"
+ "github.com/mmcdole/gofeed"
+ "log"
"net/http"
"time"
- "github.com/mmcdole/gofeed"
- "adammathes.com/neko/vlog"
)
-
func Crawl() {
ch := make(chan string)
- feeds,err := feed.All()
+ feeds, err := feed.All()
if err != nil {
log.Fatal(err)
}
@@ -64,7 +63,7 @@ func CrawlFeed(f *feed.Feed, ch chan<- string) {
// a lot of RSS2.0 generated by wordpress and others
// uses <content:encoded>
- e,ok := i.Extensions["content"]["encoded"]
+ e, ok := i.Extensions["content"]["encoded"]
var encoded = ""
if ok {
encoded = e[0].Value
@@ -73,16 +72,18 @@ func CrawlFeed(f *feed.Feed, ch chan<- string) {
item.Description = encoded
}
- if(i.PublishedParsed != nil) {
+ if i.PublishedParsed != nil {
item.PublishDate = i.PublishedParsed.Format("2006-01-02 15:04:05")
} else {
item.PublishDate = time.Now().Format("2006-01-02 15:04:05")
}
-
+
item.FeedId = f.Id
err := item.Create()
if err != nil {
vlog.Println(err)
+ } else {
+ item.GetFullContent()
}
}
ch <- "successfully crawled " + f.Url + "\n"
diff --git a/init.sql b/init.sql
index d69d5e9..d2c0de1 100644
--- a/init.sql
+++ b/init.sql
@@ -17,6 +17,7 @@ CREATE TABLE item (
title TEXT,
url VARCHAR(255) NOT NULL,
description TEXT,
+ full_content TEXT,
publish_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP NOT NULL,
read_state BOOLEAN DEFAULT FALSE NOT NULL,
starred BOOLEAN DEFAULT FALSE NOT NULL,
diff --git a/models/item/item.go b/models/item/item.go
index 5a4c274..48d76bf 100644
--- a/models/item/item.go
+++ b/models/item/item.go
@@ -1,26 +1,32 @@
package item
import (
- "fmt"
- "log"
"adammathes.com/neko/models"
+ "fmt"
+ "github.com/advancedlogic/GoOse"
"github.com/microcosm-cc/bluemonday"
+ "github.com/russross/blackfriday"
+ "log"
)
type Item struct {
- Id int64 `json:"_id,string,omitempty"`
+ Id int64 `json:"_id,string,omitempty"`
+
+ Title string `json:"title"`
+ Url string `json:"url"`
- Title string `json:"title"`
- Url string `json:"url"`
Description string `json:"description"`
PublishDate string `json:"publish_date"`
- FeedId int64
- FeedTitle string `json:"feed_title"`
- FeedUrl string `json:"feed_url"`
+ FeedId int64
+ FeedTitle string `json:"feed_title"`
+ FeedUrl string `json:"feed_url"`
+
+ ReadState bool `json:"read"`
+ Starred bool `json:"starred"`
- ReadState bool `json:"read"`
- Starred bool `json:"starred"`
+ FullContent string `json:"full_content"`
+ HeaderImage string `json:"header_image"`
}
func (i *Item) Print() {
@@ -61,12 +67,34 @@ func (i *Item) FullSave() {
}
}
+func (i *Item) GetFullContent() { // GetFullContent runs GoOse extraction on i.Url and writes the result to the item's row; it returns nothing and does not modify i.
+ g := goose.New()
+ article, err := g.ExtractFromURL(i.Url) // article is only used on the err == nil branch below
+ var md, img string
+ md = "" // same as the zero value; both columns are written as "" when extraction fails
+ img = ""
+ if err != nil {
+ log.Println(err) // extraction failure is logged only; the UPDATE below still runs with empty values
+ } else {
+ md = string(blackfriday.MarkdownCommon([]byte(article.CleanedText))) // the extracted text is passed through blackfriday's Markdown renderer before storage
+ img = article.TopImage
+ }
+
+ _, err = models.DB.Exec(`UPDATE item
+ SET full_content=?, header_image=?
+ WHERE id=?`, md, img, i.Id) // NOTE(review): the init.sql hunk in this change adds full_content only - confirm item.header_image exists. Presumably i.Id was set by Create; verify. Filter scans both columns into plain strings, so they must not stay NULL.
+ if err != nil {
+ log.Println(err) // database errors are logged, not returned to the caller
+ }
+}
+
func Filter(max_id int64, feed_id int64, unread_only bool, starred_only bool) ([]*Item, error) {
var args []interface{}
query := `SELECT item.id, item.title, item.url, item.description,
item.read_state, item.starred, item.publish_date,
+ item.full_content, item.header_image,
feed.url, feed.title
FROM item,feed
WHERE item.feed_id=feed.id `
@@ -89,7 +117,6 @@ func Filter(max_id int64, feed_id int64, unread_only bool, starred_only bool) ([
query = query + " AND item.starred=1 "
}
-
query = query + "ORDER BY item.id DESC LIMIT 15"
// log.Println(query)
// log.Println(args...)
@@ -106,16 +133,15 @@ func Filter(max_id int64, feed_id int64, unread_only bool, starred_only bool) ([
p.AllowAttrs("href").OnElements("a")
p.AllowAttrs("src", "alt").OnElements("img")
-
items := make([]*Item, 0)
for rows.Next() {
i := new(Item)
- err := rows.Scan(&i.Id, &i.Title, &i.Url, &i.Description, &i.ReadState, &i.Starred, &i.PublishDate, &i.FeedUrl, &i.FeedTitle)
+ err := rows.Scan(&i.Id, &i.Title, &i.Url, &i.Description, &i.ReadState, &i.Starred, &i.PublishDate, &i.FullContent, &i.HeaderImage, &i.FeedUrl, &i.FeedTitle)
if err != nil {
log.Println(err)
return nil, err
}
-
+
// sanitize all fields from external input
// should do this at ingest time, probably, for efficiency
// but still may need to adjust rules
@@ -124,6 +150,7 @@ func Filter(max_id int64, feed_id int64, unread_only bool, starred_only bool) ([
i.Url = p.Sanitize(i.Url)
i.FeedTitle = p.Sanitize(i.FeedTitle)
i.FeedUrl = p.Sanitize(i.FeedUrl)
+ i.FullContent = p.Sanitize(i.FullContent)
items = append(items, i)
}
if err = rows.Err(); err != nil {
diff --git a/static/ui.html b/static/ui.html
index 7b1d08c..b977dad 100644
--- a/static/ui.html
+++ b/static/ui.html
@@ -39,7 +39,12 @@
${ item.publish_date } from <a href="${item.feed_url}">${item.feed_title}</a>
</p>
<div class="description">{{html item.description}}</div>
- </script>
+
+ <h3>full content</h3>
+ <div class="img"><img src="${item.header_image}" /></div>
+ <div class="description">{{html item.full_content}}</div>
+
+ </script>
<script id="tag_template" type="text/jqtmp">
${tag.name} ${tag.unread}