Skip to content

Commit

Permalink
articles saved to pdf. subscription bugs fixed
Browse files Browse the repository at this point in the history
  • Loading branch information
isanvicente committed Aug 7, 2023
1 parent 0582fbb commit f545b49
Show file tree
Hide file tree
Showing 3 changed files with 90 additions and 16 deletions.
55 changes: 55 additions & 0 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@
<project.custom.java.version>1.8</project.custom.java.version>
<maven.compiler.source>${project.custom.java.version}</maven.compiler.source>
<maven.compiler.target>${project.custom.java.version}</maven.compiler.target>
<!-- Define the version of OPEN HTML TO PDF in the properties section of your POM. -->
<openhtml.version>1.0.10</openhtml.version>
</properties>
<prerequisites>
<maven>3.0</maven>
Expand Down Expand Up @@ -176,6 +178,59 @@
<artifactId>guava</artifactId>
<version>31.1-jre</version>
</dependency>

<dependency>
<!-- ALWAYS required, usually included transitively. -->
<groupId>com.openhtmltopdf</groupId>
<artifactId>openhtmltopdf-core</artifactId>
<version>${openhtml.version}</version>
</dependency>

<dependency>
<!-- Required for PDF output. -->
<groupId>com.openhtmltopdf</groupId>
<artifactId>openhtmltopdf-pdfbox</artifactId>
<version>${openhtml.version}</version>
</dependency>

<dependency>
<!-- Required for image output only. -->
<groupId>com.openhtmltopdf</groupId>
<artifactId>openhtmltopdf-java2d</artifactId>
<version>${openhtml.version}</version>
</dependency>

<dependency>
<!-- Optional, leave out if you do not need right-to-left or bi-directional text support. -->
<groupId>com.openhtmltopdf</groupId>
<artifactId>openhtmltopdf-rtl-support</artifactId>
<version>${openhtml.version}</version>
</dependency>

<dependency>
<!-- Optional, leave out if you do not need logging via slf4j. -->
<groupId>com.openhtmltopdf</groupId>
<artifactId>openhtmltopdf-slf4j</artifactId>
<version>${openhtml.version}</version>
</dependency>

<dependency>
<!-- Optional, leave out if you do not need SVG support. -->
<groupId>com.openhtmltopdf</groupId>
<artifactId>openhtmltopdf-svg-support</artifactId>
<version>${openhtml.version}</version>
</dependency>

<dependency>
<!-- Optional, leave out if you do not need MathML support. -->
<!-- Introduced in RC-13. -->
<groupId>com.openhtmltopdf</groupId>
<artifactId>openhtmltopdf-mathml-support</artifactId>
<version>${openhtml.version}</version>
</dependency>



</dependencies>


Expand Down
49 changes: 33 additions & 16 deletions src/main/java/elh/eus/MSM/FeedReader.java
Original file line number Diff line number Diff line change
Expand Up @@ -60,8 +60,10 @@
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;

import javax.naming.NamingException;

Expand Down Expand Up @@ -123,12 +125,13 @@
import org.openqa.selenium.TimeoutException;
import org.openqa.selenium.ElementNotInteractableException;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebDriverException;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions;
import org.openqa.selenium.support.ui.ExpectedConditions;
import org.openqa.selenium.support.ui.WebDriverWait;


import com.openhtmltopdf.pdfboxout.PdfRendererBuilder;

/**
* RSS/Atom feed reader.
Expand All @@ -148,6 +151,7 @@ public class FeedReader {
private Set<Keyword> independentkwrds = new HashSet<Keyword>();
private Set<Keyword> dependentkwrds = new HashSet<Keyword>();
private Set<Long> census = new HashSet<Long>();
private String fileStorePath = "";

private static Pattern anchorPattern; //pattern for anchor kwrds. they are usually general terms.
private HashMap<Integer,Pattern> kwrdPatterns = new HashMap<Integer,Pattern>(); //patterns for keywords.
Expand Down Expand Up @@ -345,6 +349,8 @@ public FeedReader(String config, Set<Feed> feedList, Set<Keyword> kwrdList, Stri
+ "If a source requires authentication related articles may not be downloaded correctly.");
}

fileStorePath=params.getProperty("fileStorePath", "/tmp/");

}//end constructor


Expand Down Expand Up @@ -577,7 +583,16 @@ private void getRssFeed (Feed f, String store){
else
{
//processFullArticle(doc,lang, pubDate, link, f.getSrcId(), store);
parseArticleForKeywords(doc,lang, pubDate, link, f.getSrcId(), store);
boolean mentionsFound=parseArticleForKeywords(doc,lang, pubDate, link, f.getSrcId(), store);
//if the article contained any mentions, store a PDF copy of the article
if (mentionsFound) {
OutputStream os = new FileOutputStream(fileStorePath+link);
PdfRendererBuilder builder = new PdfRendererBuilder();
builder.useFastMode();
builder.withHtmlContent(is.toString(),link); //)withUri(is);
builder.toStream(os);
builder.run();
}
}
}
}
Expand Down Expand Up @@ -784,10 +799,10 @@ private void getMultimediaFeed (Feed f, String store, String ffmpeg, float split
* @param link
* @param srcId
*/
private void parseArticleForKeywords(TextDocument doc, String lang, Date date, String link, long srcId, String store) {
private boolean parseArticleForKeywords(TextDocument doc, String lang, Date date, String link, long srcId, String store) {

Set<Keyword> result = new HashSet<Keyword>();

boolean mentionsFound=false;
String wholeText = StringUtils.stripAccents(doc.getContent()).toLowerCase();
boolean anchorFound = false;
if (anchorPattern == null)
Expand Down Expand Up @@ -893,6 +908,7 @@ private void parseArticleForKeywords(TextDocument doc, String lang, Date date, S

if (result != null && !result.isEmpty())
{
mentionsFound=true;
Mention m = new Mention(lang,par,date,link,srcId,true);
m.setKeywords(result);
if (store.equalsIgnoreCase("db"))
Expand All @@ -907,6 +923,7 @@ private void parseArticleForKeywords(TextDocument doc, String lang, Date date, S
}
}
} //for each paragraph
return mentionsFound;
}

/**
Expand Down Expand Up @@ -1211,13 +1228,13 @@ private InputSource fetchHTML(URL linkSrc, CookieStore cst) throws IOException,
/**/
boolean startSelenium(FeedCredential cred)
{
System.setProperty("webdriver.chrome.driver",params.getProperty("chromedriverPath", "chromedriver"));
//System.setProperty("webdriver.chrome.bin", "/usr/bin/google-chrome-beta");
ChromeOptions seleniumOptions = new ChromeOptions();
String[] seleniumOpts=params.getProperty("seleniumOptions","");
if (! seleniumOpts.equalsIgnoreCase("")){
for (String o : seleniumOpts.split(";")){
seleniumOptions.addArguments(o);
System.setProperty("webdriver.chrome.driver",params.getProperty("chromedriverPath", "chromedriver"));
//System.setProperty("webdriver.chrome.bin", "/usr/bin/google-chrome-beta");
ChromeOptions seleniumOptions = new ChromeOptions();
String seleniumOpts=params.getProperty("seleniumOptions","");
if (! seleniumOpts.equalsIgnoreCase("")){
for (String o : seleniumOpts.split(";")){
seleniumOptions.addArguments(o);
}
}
seleniumOptions.setBinary("/usr/bin/google-chrome-beta");
Expand All @@ -1231,9 +1248,9 @@ boolean startSelenium(FeedCredential cred)
seleniumDriver.close();
seleniumDriver=new ChromeDriver(seleniumOptions);
seleniumDriver.get(cred.getSsourl());
}catch (WebDriverException se){
System.err.println("FeadReader::getRssFeed -> selenium could not open login page proceeding without it");
return 0;
}catch (WebDriverException se2){
System.err.println("FeadReader::getRssFeed -> selenium could not open login page proceeding without it");
return false;
}

}
Expand Down Expand Up @@ -1265,10 +1282,10 @@ boolean startSelenium(FeedCredential cred)
seleniumDriver.findElement(By.id(cred.getPassField())).sendKeys(cred.getSsopass() + Keys.ENTER);
}catch (ElementNotInteractableException nie){
System.err.println("FeadReader::getRssFeed -> selenium found an element not clickable, proceeding without login");
return 0;
return false;
}

return 1;
return true;
}


Expand Down
2 changes: 2 additions & 0 deletions src/main/resources/example.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@ feedType="press"
#feedAuth is used to pass credentials for one or more feeds (separated by ';'), in the format domain1::loginUrl1::usr1::pass1::usrField1::passField1::cookienotice1;domain2::loginUrl2::usr2::pass2::usrField2::passField2::cookienotice2
feedAuth=www.example.com::https://www.example.com/login::email@example.com::examplePASS::user_name_or_email_field::user_password_field:://xpathTo[@class='cookienotize ok button']

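#fileStorePath is the folder where a PDF copy of each article containing mentions is stored (defaults to /tmp/ when unset)
fileStorePath=/path/to/folder/for/storing/found/news/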

## Search Engine API connection info
BingKey="BingAPIkey"

Expand Down

0 comments on commit f545b49

Please sign in to comment.