From f545b4957b6fcf4e72abfd95bac7fb6d8ca61dbb Mon Sep 17 00:00:00 2001 From: isanvicente Date: Mon, 7 Aug 2023 15:04:07 +0200 Subject: [PATCH] articles saved to pdf. subscription bugs fixed --- pom.xml | 55 +++++++++++++++++++++++ src/main/java/elh/eus/MSM/FeedReader.java | 49 +++++++++++++------- src/main/resources/example.cfg | 2 + 3 files changed, 90 insertions(+), 16 deletions(-) diff --git a/pom.xml b/pom.xml index c2802d5..1958a88 100644 --- a/pom.xml +++ b/pom.xml @@ -23,6 +23,8 @@ 1.8 ${project.custom.java.version} ${project.custom.java.version} + + 1.0.10 3.0 @@ -176,6 +178,59 @@ guava 31.1-jre + + + + com.openhtmltopdf + openhtmltopdf-core + ${openhtml.version} + + + + + com.openhtmltopdf + openhtmltopdf-pdfbox + ${openhtml.version} + + + + + com.openhtmltopdf + openhtmltopdf-java2d + ${openhtml.version} + + + + + com.openhtmltopdf + openhtmltopdf-rtl-support + ${openhtml.version} + + + + + com.openhtmltopdf + openhtmltopdf-slf4j + ${openhtml.version} + + + + + com.openhtmltopdf + openhtmltopdf-svg-support + ${openhtml.version} + + + + + + com.openhtmltopdf + openhtmltopdf-mathml-support + ${openhtml.version} + + + + diff --git a/src/main/java/elh/eus/MSM/FeedReader.java b/src/main/java/elh/eus/MSM/FeedReader.java index fbac62b..90fbec3 100644 --- a/src/main/java/elh/eus/MSM/FeedReader.java +++ b/src/main/java/elh/eus/MSM/FeedReader.java @@ -60,8 +60,10 @@ import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; +import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; +import java.io.OutputStream; import javax.naming.NamingException; @@ -123,12 +125,13 @@ import org.openqa.selenium.TimeoutException; import org.openqa.selenium.ElementNotInteractableException; import org.openqa.selenium.WebDriver; +import org.openqa.selenium.WebDriverException; import org.openqa.selenium.chrome.ChromeDriver; import org.openqa.selenium.chrome.ChromeOptions; import 
org.openqa.selenium.support.ui.ExpectedConditions; import org.openqa.selenium.support.ui.WebDriverWait; - +import com.openhtmltopdf.pdfboxout.PdfRendererBuilder; /** * RSS/Atom feed reader. @@ -148,6 +151,7 @@ public class FeedReader { private Set independentkwrds = new HashSet(); private Set dependentkwrds = new HashSet(); private Set census = new HashSet(); + private String fileStorePath = ""; private static Pattern anchorPattern; //pattern for anchor kwrds. they are usually general terms. private HashMap kwrdPatterns = new HashMap(); //patterns for keywords. @@ -345,6 +349,8 @@ public FeedReader(String config, Set feedList, Set kwrdList, Stri + "If a source requires authentication related articles may not be downloaded correctly."); } + fileStorePath=params.getProperty("fileStorePath", "/tmp/"); + }//end constructor @@ -577,7 +583,16 @@ private void getRssFeed (Feed f, String store){ else { //processFullArticle(doc,lang, pubDate, link, f.getSrcId(), store); - parseArticleForKeywords(doc,lang, pubDate, link, f.getSrcId(), store); + boolean mentionsFound=parseArticleForKeywords(doc,lang, pubDate, link, f.getSrcId(), store); + //albisteak aipamenik bazuen gorde albistearen pdf-a + if (mentionsFound) { + OutputStream os = new FileOutputStream(fileStorePath+link); + PdfRendererBuilder builder = new PdfRendererBuilder(); + builder.useFastMode(); + builder.withHtmlContent(is.toString(),link); //)withUri(is); + builder.toStream(os); + builder.run(); + } } } } @@ -784,10 +799,10 @@ private void getMultimediaFeed (Feed f, String store, String ffmpeg, float split * @param link * @param srcId */ - private void parseArticleForKeywords(TextDocument doc, String lang, Date date, String link, long srcId, String store) { + private boolean parseArticleForKeywords(TextDocument doc, String lang, Date date, String link, long srcId, String store) { Set result = new HashSet(); - + boolean mentionsFound=false; String wholeText = StringUtils.stripAccents(doc.getContent()).toLowerCase(); 
boolean anchorFound = false; if (anchorPattern == null) @@ -893,6 +908,7 @@ private void parseArticleForKeywords(TextDocument doc, String lang, Date date, S if (result != null && !result.isEmpty()) { + mentionsFound=true; Mention m = new Mention(lang,par,date,link,srcId,true); m.setKeywords(result); if (store.equalsIgnoreCase("db")) @@ -907,6 +923,7 @@ private void parseArticleForKeywords(TextDocument doc, String lang, Date date, S } } } //for each paragraph + return mentionsFound; } /** @@ -1211,13 +1228,13 @@ private InputSource fetchHTML(URL linkSrc, CookieStore cst) throws IOException, /**/ boolean startSelenium(FeedCredential cred) { - System.setProperty("webdriver.chrome.driver",params.getProperty("chromedriverPath", "chromedriver")); - //System.setProperty("webdriver.chrome.bin", "/usr/bin/google-chrome-beta"); - ChromeOptions seleniumOptions = new ChromeOptions(); - String[] seleniumOpts=params.getProperty("seleniumOptions",""); - if (! seleniumOpts.equalsIgnoreCase("")){ - for (String o : seleniumOpts.split(";")){ - seleniumOptions.addArguments(o); + System.setProperty("webdriver.chrome.driver",params.getProperty("chromedriverPath", "chromedriver")); + //System.setProperty("webdriver.chrome.bin", "/usr/bin/google-chrome-beta"); + ChromeOptions seleniumOptions = new ChromeOptions(); + String seleniumOpts=params.getProperty("seleniumOptions",""); + if (! 
seleniumOpts.equalsIgnoreCase("")){ + for (String o : seleniumOpts.split(";")){ + seleniumOptions.addArguments(o); } } seleniumOptions.setBinary("/usr/bin/google-chrome-beta"); @@ -1231,9 +1248,9 @@ boolean startSelenium(FeedCredential cred) seleniumDriver.close(); seleniumDriver=new ChromeDriver(seleniumOptions); seleniumDriver.get(cred.getSsourl()); - }catch (WebDriverException se){ - System.err.println("FeadReader::getRssFeed -> selenium could not open login page proceeding without it"); - return 0; + }catch (WebDriverException se2){ + System.err.println("FeadReader::getRssFeed -> selenium could not open login page proceeding without it"); + return false; } } @@ -1265,10 +1282,10 @@ boolean startSelenium(FeedCredential cred) seleniumDriver.findElement(By.id(cred.getPassField())).sendKeys(cred.getSsopass() + Keys.ENTER); }catch (ElementNotInteractableException nie){ System.err.println("FeadReader::getRssFeed -> selenium found an element not clickable, proceeding without login"); - return 0; + return false; } - return 1; + return true; } diff --git a/src/main/resources/example.cfg b/src/main/resources/example.cfg index 2b66478..74c5871 100644 --- a/src/main/resources/example.cfg +++ b/src/main/resources/example.cfg @@ -32,6 +32,8 @@ feedType="press" #feedAuth is used to pass credentials for one or more feeds, in the format domain1::usr1::pass1::usrField1::passField1::cookienotice1;domain2::usr2:pass2::usrField2::passField2::cookienotice2 feedAuth=www.example.com::https://www.example.com/login::email@example.com::examplePASS::user_name_or_email_field::user_password_field:://xpathTo[@class='cookienotize ok button'] +fileStorePath=/path/to/folder/for/storing/found/news/ + ## Search Engine API connection info BingKey="BingAPIkey"