Skip to content

Commit

Permalink
Add cdp support for xpath scrapers (stashapp#625)
Browse files Browse the repository at this point in the history
Co-authored-by: WithoutPants <53250216+WithoutPants@users.noreply.github.com>
  • Loading branch information
bnkai and WithoutPants authored Aug 4, 2020
1 parent 1a63f6a commit 5beda52
Show file tree
Hide file tree
Showing 284 changed files with 133,250 additions and 54 deletions.
2 changes: 2 additions & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@ require (
github.com/99designs/gqlgen v0.9.0
github.com/antchfx/htmlquery v1.2.3
github.com/bmatcuk/doublestar v1.3.1
github.com/chromedp/cdproto v0.0.0-20200608134039-8a80cdaf865c
github.com/chromedp/chromedp v0.5.3
github.com/disintegration/imaging v1.6.0
github.com/go-chi/chi v4.0.2+incompatible
github.com/gobuffalo/packr/v2 v2.0.2
Expand Down
17 changes: 17 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,11 @@ github.com/bmatcuk/doublestar v1.3.1/go.mod h1:wiQtGV+rzVYxB7WIlirSN++5HPtPlXEo9
github.com/bmizerany/assert v0.0.0-20160611221934-b7ed37b82869/go.mod h1:Ekp36dRnpXw/yCqJaO+ZrUyxD+3VXMFFr56k5XYrpB4=
github.com/bradfitz/go-smtpd v0.0.0-20170404230938-deb6d6237625/go.mod h1:HYsPBTaaSFSlLx/70C2HPIMNZpVV8+vt/A+FMnYP11g=
github.com/cespare/xxhash v1.1.0/go.mod h1:XrSqR1VqqWfGrhpAt58auRo0WTKS1nRRg3ghfAqPWnc=
github.com/chromedp/cdproto v0.0.0-20200116234248-4da64dd111ac/go.mod h1:PfAWWKJqjlGFYJEidUM6aVIWPr0EpobeyVWEEmplX7g=
github.com/chromedp/cdproto v0.0.0-20200608134039-8a80cdaf865c h1:qM1xzKK8kc93zKPkxK4iqtjksqDDrU6g9wGnr30jyLo=
github.com/chromedp/cdproto v0.0.0-20200608134039-8a80cdaf865c/go.mod h1:E6LPWRdIJc11h/di5p0rwvRmUYbhGpBEH7ZbPfzDIOE=
github.com/chromedp/chromedp v0.5.3 h1:F9LafxmYpsQhWQBdCs+6Sret1zzeeFyHS5LkRF//Ffg=
github.com/chromedp/chromedp v0.5.3/go.mod h1:YLdPtndaHQ4rCpSpBG+IPpy9JvX0VD+7aaLxYgYj28w=
github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDkc90ppPyw=
github.com/cockroachdb/apd v1.1.0/go.mod h1:8Sl8LxpKi29FqWXR16WEFZRNSz3SoPzUzeMeY4+DwBQ=
github.com/cockroachdb/cockroach-go v0.0.0-20181001143604-e0a95dfd547c/go.mod h1:XGLbWH/ujMcbPbhZq52Nv6UrCghb1yGn//133kEsvDk=
Expand Down Expand Up @@ -310,6 +315,12 @@ github.com/gobuffalo/uuid v2.0.5+incompatible/go.mod h1:ErhIzkRhm0FtRuiE/PeORqcw
github.com/gobuffalo/validate v2.0.3+incompatible/go.mod h1:N+EtDe0J8252BgfzQUChBgfd6L93m9weay53EWFVsMM=
github.com/gobuffalo/x v0.0.0-20181003152136-452098b06085/go.mod h1:WevpGD+5YOreDJznWevcn8NTmQEW5STSBgIkpkjzqXc=
github.com/gobuffalo/x v0.0.0-20181007152206-913e47c59ca7/go.mod h1:9rDPXaB3kXdKWzMc4odGQQdG2e2DIEmANy5aSJ9yesY=
github.com/gobwas/httphead v0.0.0-20180130184737-2c6c146eadee h1:s+21KNqlpePfkah2I+gwHF8xmJWRjooY+5248k6m4A0=
github.com/gobwas/httphead v0.0.0-20180130184737-2c6c146eadee/go.mod h1:L0fX3K22YWvt/FAX9NnzrNzcI4wNYi9Yku4O0LKYflo=
github.com/gobwas/pool v0.2.0 h1:QEmUOlnSjWtnpRGHF3SauEiOsy82Cup83Vf2LcMlnc8=
github.com/gobwas/pool v0.2.0/go.mod h1:q8bcK0KcYlCgd9e7WYLm9LpyS+YeLd8JVDW6WezmKEw=
github.com/gobwas/ws v1.0.2 h1:CoAavW/wd/kulfZmSIBt6p24n4j7tHgNVCjsfHVNUbo=
github.com/gobwas/ws v1.0.2/go.mod h1:szmBTxLgaFppYjEmNtny/v3w89xOydFnnZMcgRRu/EM=
github.com/gocql/gocql v0.0.0-20190301043612-f6df8288f9b4/go.mod h1:4Fw1eo5iaEhDUs8XyuhSVCVy52Jq3L+/3GJgYkwc+/0=
github.com/gofrs/uuid v3.1.0+incompatible/go.mod h1:b2aQJv3Z4Fp6yNu3cdSllBxTCLRxnplIgP/c0N/04lM=
github.com/gofrs/uuid v3.2.0+incompatible/go.mod h1:b2aQJv3Z4Fp6yNu3cdSllBxTCLRxnplIgP/c0N/04lM=
Expand Down Expand Up @@ -408,6 +419,8 @@ github.com/karrick/godirwalk v1.7.8/go.mod h1:2c9FRhkDxdIbgkOnCEvnSWs71Bhugbl46s
github.com/kballard/go-shellquote v0.0.0-20180428030007-95032a82bc51/go.mod h1:CzGEWj7cYgsdH8dAjBGEr58BoE7ScuLd+fwFZ44+/x8=
github.com/kisielk/errcheck v1.1.0/go.mod h1:EZBBE59ingxPouuu3KfxchcWSUPOHkagtvWXihfKN4Q=
github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck=
github.com/knq/sysutil v0.0.0-20191005231841-15668db23d08 h1:V0an7KRw92wmJysvFvtqtKMAPmvS5O0jtB0nYo6t+gs=
github.com/knq/sysutil v0.0.0-20191005231841-15668db23d08/go.mod h1:dFWs1zEqDjFtnBXsd1vPOZaLsESovai349994nHx3e0=
github.com/konsorten/go-windows-terminal-sequences v0.0.0-20180402223658-b729f2633dfe/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ=
github.com/konsorten/go-windows-terminal-sequences v1.0.1 h1:mweAR1A6xJ3oS2pRaGiHgQ4OO8tzTaLawm8vnODuwDk=
github.com/konsorten/go-windows-terminal-sequences v1.0.1/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ=
Expand All @@ -425,6 +438,9 @@ github.com/lib/pq v1.0.0 h1:X5PMW56eZitiTeO7tKzZxFCSpbFZJtkMMooicw2us9A=
github.com/lib/pq v1.0.0/go.mod h1:5WUZQaWbwv1U+lTReE5YruASi9Al49XbQIvNi/34Woo=
github.com/magiconair/properties v1.8.0 h1:LLgXmsheXeRoUOBOjtwPQCWIYqM/LU1ayDtDePerRcY=
github.com/magiconair/properties v1.8.0/go.mod h1:PppfXfuXeibc/6YijjN8zIbojt8czPbwD3XqdrwzmxQ=
github.com/mailru/easyjson v0.7.0/go.mod h1:KAzv3t3aY1NaHWoQz1+4F1ccyAH66Jk7yos7ldAVICs=
github.com/mailru/easyjson v0.7.1 h1:mdxE1MF9o53iCb2Ghj1VfWvh7ZOwHpnVG/xwXrV90U8=
github.com/mailru/easyjson v0.7.1/go.mod h1:KAzv3t3aY1NaHWoQz1+4F1ccyAH66Jk7yos7ldAVICs=
github.com/markbates/deplist v1.0.4/go.mod h1:gRRbPbbuA8TmMiRvaOzUlRfzfjeCCBqX2A6arxN01MM=
github.com/markbates/deplist v1.0.5/go.mod h1:gRRbPbbuA8TmMiRvaOzUlRfzfjeCCBqX2A6arxN01MM=
github.com/markbates/going v1.0.2/go.mod h1:UWCk3zm0UKefHZ7l8BNqi26UyiEMniznk8naLdTcy6c=
Expand Down Expand Up @@ -722,6 +738,7 @@ golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7w
golang.org/x/sys v0.0.0-20190422165155-953cdadca894/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20190426135247-a129542de9ae h1:mQLHiymj/JXKnnjc62tb7nD5pZLs940/sXJu+Xp3DBA=
golang.org/x/sys v0.0.0-20190426135247-a129542de9ae/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200116001909-b77594299b42/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200323222414-85ca7c5b95cd h1:xhmwyvizuTgC2qz7ZlMluP20uW+C3Rm0FD/WLDX8884=
golang.org/x/sys v0.0.0-20200323222414-85ca7c5b95cd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/text v0.3.0 h1:g61tztE5qeGQ89tm6NTjjM9VPIm088od1l6aSorWRWg=
Expand Down
1 change: 1 addition & 0 deletions graphql/documents/data/config.graphql
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ fragment ConfigGeneralData on ConfigGeneralResult {
logAccess
excludes
scraperUserAgent
scraperCDPPath
}

fragment ConfigInterfaceData on ConfigInterfaceResult {
Expand Down
4 changes: 4 additions & 0 deletions graphql/schema/types/config.graphql
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,8 @@ input ConfigGeneralInput {
excludes: [String!]
"""Scraper user agent string"""
scraperUserAgent: String
"""Scraper CDP path. Path to chrome executable or remote address"""
scraperCDPPath: String
}

type ConfigGeneralResult {
Expand Down Expand Up @@ -101,6 +103,8 @@ type ConfigGeneralResult {
excludes: [String!]!
"""Scraper user agent string"""
scraperUserAgent: String
"""Scraper CDP path. Path to chrome executable or remote address"""
scraperCDPPath: String
}

input ConfigInterfaceInput {
Expand Down
10 changes: 10 additions & 0 deletions pkg/api/resolver_mutation_configure.go
Original file line number Diff line number Diff line change
Expand Up @@ -103,15 +103,25 @@ func (r *mutationResolver) ConfigureGeneral(ctx context.Context, input models.Co
config.Set(config.Exclude, input.Excludes)
}

refreshScraperCache := false
if input.ScraperUserAgent != nil {
config.Set(config.ScraperUserAgent, input.ScraperUserAgent)
refreshScraperCache = true
}

if input.ScraperCDPPath != nil {
config.Set(config.ScraperCDPPath, input.ScraperCDPPath)
refreshScraperCache = true
}

if err := config.Write(); err != nil {
return makeConfigGeneralResult(), err
}

manager.GetInstance().RefreshConfig()
if refreshScraperCache {
manager.GetInstance().RefreshScraperCache()
}

return makeConfigGeneralResult(), nil
}
Expand Down
2 changes: 2 additions & 0 deletions pkg/api/resolver_query_configuration.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ func makeConfigGeneralResult() *models.ConfigGeneralResult {
maxStreamingTranscodeSize := config.GetMaxStreamingTranscodeSize()

scraperUserAgent := config.GetScraperUserAgent()
scraperCDPPath := config.GetScraperCDPPath()

return &models.ConfigGeneralResult{
Stashes: config.GetStashPaths(),
Expand All @@ -62,6 +63,7 @@ func makeConfigGeneralResult() *models.ConfigGeneralResult {
LogAccess: config.GetLogAccess(),
Excludes: config.GetExcludes(),
ScraperUserAgent: &scraperUserAgent,
ScraperCDPPath: &scraperCDPPath,
}
}

Expand Down
7 changes: 7 additions & 0 deletions pkg/manager/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ const SessionStoreKey = "session_store_key"
// scraping options
const ScrapersPath = "scrapers_path"
const ScraperUserAgent = "scraper_user_agent"
const ScraperCDPPath = "scraper_cdp_path"

// i18n
const Language = "language"
Expand Down Expand Up @@ -158,6 +159,12 @@ func GetScraperUserAgent() string {
return viper.GetString(ScraperUserAgent)
}

// GetScraperCDPPath returns the configured scraper CDP location, which is
// either the path to a Chrome executable or the remote address of a running
// Chrome instance.
func GetScraperCDPPath() string {
	cdpPath := viper.GetString(ScraperCDPPath)
	return cdpPath
}

// GetHost returns the string stored under the Host configuration key.
func GetHost() string {
	host := viper.GetString(Host)
	return host
}
Expand Down
8 changes: 8 additions & 0 deletions pkg/manager/manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -153,10 +153,12 @@ func initLog() {
logger.Init(config.GetLogFile(), config.GetLogOut(), config.GetLogLevel())
}

// initScraperCache initializes a new scraper cache and returns it.
func initScraperCache() *scraper.Cache {
scraperConfig := scraper.GlobalConfig{
Path: config.GetScrapersPath(),
UserAgent: config.GetScraperUserAgent(),
CDPPath: config.GetScraperCDPPath(),
}
ret, err := scraper.NewCache(scraperConfig)

Expand All @@ -178,3 +180,9 @@ func (s *singleton) RefreshConfig() {
paths.EnsureJSONDirs()
}
}

// RefreshScraperCache replaces the singleton's scraper cache with a freshly
// initialized one. It should be called whenever scraper-related
// configuration has been changed.
func (s *singleton) RefreshScraperCache() {
	cache := initScraperCache()
	s.ScraperCache = cache
}
8 changes: 8 additions & 0 deletions pkg/scraper/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,9 @@ type config struct {

// Xpath scraping configurations
XPathScrapers mappedScrapers `yaml:"xPathScrapers"`

// Scraping driver options
DriverOptions *scraperDriverOptions `yaml:"driver"`
}

func (c config) validate() error {
Expand Down Expand Up @@ -135,6 +138,11 @@ type scraperDebugOptions struct {
PrintHTML bool `yaml:"printHTML"`
}

// scraperDriverOptions holds the per-scraper "driver" settings read from the
// scraper YAML. They control how a page is fetched before it is scraped.
type scraperDriverOptions struct {
	// UseCDP makes the scraper load pages through Chrome (CDP) instead of
	// the plain HTTP client.
	UseCDP bool `yaml:"useCDP"`
	// Sleep is the number of seconds to wait after navigating before the
	// page DOM is read. Zero selects the built-in default.
	Sleep int `yaml:"sleep"`
}

func loadScraperFromYAML(id string, reader io.Reader) (*config, error) {
ret := &config{}

Expand Down
14 changes: 13 additions & 1 deletion pkg/scraper/scrapers.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import (
"os"
"path/filepath"
"strconv"
"strings"

"github.com/stashapp/stash/pkg/logger"
"github.com/stashapp/stash/pkg/models"
Expand All @@ -14,7 +15,18 @@ import (
type GlobalConfig struct {
	// User Agent used when scraping using http.
	UserAgent string

	// Path (file or remote address) to a Chrome CDP instance.
	CDPPath string

	// Path is the scrapers path taken from the application configuration.
	Path string
}

// isCDPPathHTTP reports whether CDPPath is an http(s) address, i.e. a remote
// Chrome instance whose websocket debugger URL still has to be looked up.
func (c GlobalConfig) isCDPPathHTTP() bool {
	return strings.HasPrefix(c.CDPPath, "http://") || strings.HasPrefix(c.CDPPath, "https://")
}

// isCDPPathWS reports whether CDPPath is already a websocket address of a
// remote Chrome instance. Both the plain (ws://) and the TLS (wss://) schemes
// are recognised; anything else is not a websocket address.
func (c GlobalConfig) isCDPPathWS() bool {
	return strings.HasPrefix(c.CDPPath, "ws://") || strings.HasPrefix(c.CDPPath, "wss://")
}

// Cache stores scraper details.
Expand Down
175 changes: 175 additions & 0 deletions pkg/scraper/url.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,175 @@
package scraper

import (
"context"
"errors"
"fmt"
"io"
"io/ioutil"
"net/http"
"net/http/cookiejar"
"os"
"strings"
"time"

"github.com/chromedp/cdproto/dom"
"github.com/chromedp/cdproto/network"
"github.com/chromedp/chromedp"
jsoniter "github.com/json-iterator/go"
"github.com/stashapp/stash/pkg/logger"
"golang.org/x/net/html/charset"
"golang.org/x/net/publicsuffix"
)

// loadURL fetches url and returns a reader over the page content.
//
// If the scraper's driver options request CDP, the page is loaded through
// Chrome by urlFromCDP. Otherwise a plain HTTP GET is issued using a cookie
// jar, the globally configured User-Agent (when set) and a limit of 20
// redirects. The returned reader is wrapped by charset.NewReader using the
// response's Content-Type header.
//
// The response body is read completely before returning, so the returned
// reader stays valid after the HTTP response has been closed.
func loadURL(url string, scraperConfig config, globalConfig GlobalConfig) (io.Reader, error) {
	driverOptions := scraperConfig.DriverOptions
	if driverOptions != nil && driverOptions.UseCDP {
		// get the page using chrome dp
		return urlFromCDP(url, *driverOptions, globalConfig)
	}

	// get the page using http.Client
	options := cookiejar.Options{
		PublicSuffixList: publicsuffix.List,
	}
	jar, err := cookiejar.New(&options)
	if err != nil {
		return nil, err
	}

	client := &http.Client{
		Timeout: scrapeGetTimeout,
		// defaultCheckRedirect code with max changed from 10 to 20
		CheckRedirect: func(req *http.Request, via []*http.Request) error {
			if len(via) >= 20 {
				return errors.New("stopped after 20 redirects")
			}
			return nil
		},
		Jar: jar,
	}

	req, err := http.NewRequest("GET", url, nil)
	if err != nil {
		return nil, err
	}

	if userAgent := globalConfig.UserAgent; userAgent != "" {
		req.Header.Set("User-Agent", userAgent)
	}

	resp, err := client.Do(req)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	// charset.NewReader only pre-reads a small preview and streams the rest
	// lazily. Returning a reader over resp.Body directly would therefore hand
	// the caller a body that the deferred Close has already shut. Buffer the
	// whole body first.
	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		return nil, err
	}

	return charset.NewReader(strings.NewReader(string(body)), resp.Header.Get("Content-Type"))
}

// urlFromCDP loads url in Chrome through the Chrome DevTools Protocol and
// returns a reader over the outer HTML of the resulting document.
//
// The browser is chosen from globalConfig.CDPPath:
//   - an http(s):// address is queried for the instance's websocket debugger
//     URL, which is then used as a remote allocator;
//   - a websocket address is used as a remote allocator directly;
//   - any other non-empty value is treated as the path of a Chrome
//     executable, which is started with a temporary user data directory;
//   - when empty, no allocator is configured here and chromedp falls back to
//     its own defaults.
//
// After navigating, the function waits driverOptions.Sleep seconds (2 when
// unset) before reading the DOM. An error is returned if driverOptions does
// not have UseCDP set.
func urlFromCDP(url string, driverOptions scraperDriverOptions, globalConfig GlobalConfig) (io.Reader, error) {
	const defaultSleep = 2

	if !driverOptions.UseCDP {
		return nil, fmt.Errorf("url %q should not be fetched through CDP", url)
	}

	sleep := defaultSleep
	if driverOptions.Sleep != 0 {
		sleep = driverOptions.Sleep
	}
	sleepDuration := time.Duration(sleep) * time.Second

	act := context.Background()

	// if scraperCDPPath is a remote address, then allocate accordingly
	if globalConfig.CDPPath != "" {
		var cancelAct context.CancelFunc

		if globalConfig.isCDPPathHTTP() || globalConfig.isCDPPathWS() {
			remote := globalConfig.CDPPath

			// if CDPPath is http(s) then we need to get the websocket URL
			if globalConfig.isCDPPathHTTP() {
				var err error
				remote, err = getRemoteCDPWSAddress(remote)
				if err != nil {
					return nil, err
				}
			}

			act, cancelAct = chromedp.NewRemoteAllocator(context.Background(), remote)
		} else {
			// use a temporary user directory for chrome
			dir, err := ioutil.TempDir("", "stash-chromedp")
			if err != nil {
				return nil, err
			}
			// registered first so it runs last, after the browser contexts
			// below have been cancelled
			defer os.RemoveAll(dir)

			opts := append(chromedp.DefaultExecAllocatorOptions[:],
				chromedp.UserDataDir(dir),
				chromedp.ExecPath(globalConfig.CDPPath),
			)
			act, cancelAct = chromedp.NewExecAllocator(act, opts...)
		}

		defer cancelAct()
	}

	ctx, cancel := chromedp.NewContext(act)
	defer cancel()

	var res string
	err := chromedp.Run(ctx,
		network.Enable(),
		chromedp.Navigate(url),
		// give the page time to finish building its DOM
		chromedp.Sleep(sleepDuration),
		chromedp.ActionFunc(func(ctx context.Context) error {
			node, err := dom.GetDocument().Do(ctx)
			if err != nil {
				return err
			}
			res, err = dom.GetOuterHTML().WithNodeID(node.NodeID).Do(ctx)
			return err
		}),
	)
	if err != nil {
		return nil, err
	}

	return strings.NewReader(res), nil
}

// getRemoteCDPWSAddress queries the http(s) address of a remote Chrome
// instance and returns the websocket debugger URL it advertises, which is the
// address required to drive the instance over CDP.
//
// It issues a GET to address, decodes the JSON response and reads its
// "webSocketDebuggerUrl" field. An error is returned if the request or the
// decoding fails, or if the field is missing, empty or not a string.
func getRemoteCDPWSAddress(address string) (string, error) {
	resp, err := http.Get(address)
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()

	var result map[string]interface{}
	var json = jsoniter.ConfigCompatibleWithStandardLibrary
	if err := json.NewDecoder(resp.Body).Decode(&result); err != nil {
		return "", err
	}

	// use the checked form of the assertion: the endpoint may not be a
	// Chrome instance at all, in which case the key is absent
	remote, ok := result["webSocketDebuggerUrl"].(string)
	if !ok || remote == "" {
		return "", fmt.Errorf("no webSocketDebuggerUrl in response from %s", address)
	}

	logger.Debugf("Remote cdp instance found %s", remote)
	return remote, nil
}

// cdpNetwork returns a chromedp action that enables (enable == true) or
// disables (enable == false) the CDP network domain. Any error reported by
// the underlying command is returned from the action instead of being
// discarded.
func cdpNetwork(enable bool) chromedp.Action {
	return chromedp.ActionFunc(func(ctx context.Context) error {
		if enable {
			return network.Enable().Do(ctx)
		}
		return network.Disable().Do(ctx)
	})
}
Loading

0 comments on commit 5beda52

Please sign in to comment.