From 97e935a1947291d6b422877906dbb75e22b05e2d Mon Sep 17 00:00:00 2001
From: Adam Mathes
Date: Sat, 8 Jun 2019 17:02:44 -0700
Subject: some hacky regex url/img extraction stuff

---
 render/render.go | 33 +++++++++++++++++++++++++++++++--
 1 file changed, 31 insertions(+), 2 deletions(-)

(limited to 'render/render.go')

diff --git a/render/render.go b/render/render.go
index bf53208..9698ad8 100644
--- a/render/render.go
+++ b/render/render.go
@@ -15,8 +15,7 @@ import (
 var templates map[string]*template.Template
 var BASE_TEMPLATE = "base"
 
-var rel_href *regexp.Regexp
-var rel_src *regexp.Regexp
+var rel_href, rel_src, re_href, re_src *regexp.Regexp
 
 /*
 Renderable interface - objects that render themeslves to a []byte and
@@ -86,6 +85,8 @@ func Init() {
 	}
 	rel_href = regexp.MustCompile(`href="/(.+)"`)
 	rel_src = regexp.MustCompile(`src="/(.+)"`)
+	re_href = regexp.MustCompile(`href="(.*?)"`)
+	re_src = regexp.MustCompile(`src="(.*?)"`)
 }
 
 /*
@@ -129,6 +130,34 @@ func ResolveURLs(html, prefix string) string {
 	return string(bts)
 }
 
+/*
+Finds all URLs that are hrefs
+TODO: replace noisy regex with HTML parser
+*/
+func FindURLs(html string) []string {
+	// bts := []byte(html)
+	hrefs := re_href.FindAllStringSubmatch(html, -1)
+	var urls []string
+	for _, href := range hrefs {
+		urls = append(urls, href[1])
+	}
+	return urls
+}
+
+/*
+Finds all img urls via img src tags
+TODO: replace noisy regex with HTML parser
+*/
+func FindImgs(html string) []string {
+	// bts := []byte(html)
+	srcs := re_src.FindAllStringSubmatch(html, -1)
+	var imgs []string
+	for _, src := range srcs {
+		imgs = append(imgs, src[1])
+	}
+	return imgs
+}
+
 /*
 Runs all regex filters specified in config.Config.Filters
 */
-- 
cgit v1.2.3