about | summary | refs | log | tree | commit | diff | stats
path: root/models/item
diff options
context:
space:
mode:
author: Adam Mathes <adam@adammathes.com> 2026-02-13 17:04:11 -0800
committer: Adam Mathes <adam@adammathes.com> 2026-02-13 17:04:11 -0800
commit: 993fa16af4fb5891fa9eb06be728933030cf3953 (patch)
tree: eeb0413e5380cb67a58af872b494d318701ff325 /models/item
parent: fb7d5bfb0b780486d3b6191dda7c0a340abe286e (diff)
download: neko-993fa16af4fb5891fa9eb06be728933030cf3953.tar.gz
neko-993fa16af4fb5891fa9eb06be728933030cf3953.tar.bz2
neko-993fa16af4fb5891fa9eb06be728933030cf3953.zip
fix(scraper): correctly save full content and header image
Diffstat (limited to 'models/item')
-rw-r--r-- models/item/item.go 54
1 files changed, 37 insertions, 17 deletions
diff --git a/models/item/item.go b/models/item/item.go
index 571e942..c39f623 100644
--- a/models/item/item.go
+++ b/models/item/item.go
@@ -14,6 +14,19 @@ import (
"github.com/russross/blackfriday"
)
+type ContentExtractor interface {
+ Extract(url string) (*goose.Article, error)
+}
+
+type GooseExtractor struct{}
+
+func (ge GooseExtractor) Extract(url string) (*goose.Article, error) {
+ g := goose.New()
+ return g.ExtractFromURL(url)
+}
+
+var Extractor ContentExtractor = GooseExtractor{}
+
type Item struct {
Id int64 `json:"_id,string,omitempty"`
@@ -92,35 +105,29 @@ func ItemById(id int64) *Item {
func (i *Item) GetFullContent() {
fmt.Printf("fetching from %s\n", i.Url)
- g := goose.New()
- article, err := g.ExtractFromURL(i.Url)
+ article, err := Extractor.Extract(i.Url)
if err != nil {
vlog.Println(err)
return
}
- if article.TopNode == nil {
- return
+ // i.FullContent and i.HeaderImage will be updated during extraction
+ if article.CleanedText != "" {
+ i.FullContent = string(blackfriday.Run([]byte(article.CleanedText)))
}
- var md, img string
- md = ""
- img = ""
- md = string(blackfriday.Run([]byte(article.CleanedText)))
-
- ht, err := article.TopNode.Html()
- if err != nil {
- vlog.Println(err)
- return
+ if article.TopNode != nil {
+ ht, err := article.TopNode.Html()
+ if err == nil {
+ p := filterPolicy()
+ i.FullContent = p.Sanitize(ht)
+ }
}
-
- p := filterPolicy()
- i.FullContent = p.Sanitize(ht)
i.HeaderImage = article.TopImage
_, err = models.DB.Exec(`UPDATE item
SET full_content=?, header_image=?
- WHERE id=?`, md, img, i.Id)
+ WHERE id=?`, i.FullContent, i.HeaderImage, i.Id)
if err != nil {
vlog.Println(err)
}
@@ -241,6 +248,19 @@ func rewriteImages(s string) string {
if src, ok := img.Attr("src"); ok {
img.SetAttr("src", proxyURL(src))
}
+ if srcset, ok := img.Attr("srcset"); ok {
+ // srcset is a comma-separated list of "url descriptor"
+ parts := strings.Split(srcset, ",")
+ for i, part := range parts {
+ part = strings.TrimSpace(part)
+ subparts := strings.Fields(part)
+ if len(subparts) > 0 {
+ subparts[0] = proxyURL(subparts[0])
+ }
+ parts[i] = strings.Join(subparts, " ")
+ }
+ img.SetAttr("srcset", strings.Join(parts, ", "))
+ }
})
output, _ := doc.Html()