From dfced8543c328f0775899062c33a78084f773761 Mon Sep 17 00:00:00 2001 From: isanvicente Date: Wed, 6 Sep 2023 17:48:52 +0200 Subject: [PATCH] fixed bug with xml generation and language identification --- .../java/elh/eus/MSM/DocumentParserBPipePlus.java | 12 +++++++----- src/main/java/elh/eus/MSM/FeedReader.java | 9 ++++----- src/main/java/elh/eus/MSM/MSMUtils.java | 8 +++++++- 3 files changed, 18 insertions(+), 11 deletions(-) diff --git a/src/main/java/elh/eus/MSM/DocumentParserBPipePlus.java b/src/main/java/elh/eus/MSM/DocumentParserBPipePlus.java index eead859..0c18ee8 100644 --- a/src/main/java/elh/eus/MSM/DocumentParserBPipePlus.java +++ b/src/main/java/elh/eus/MSM/DocumentParserBPipePlus.java @@ -813,19 +813,21 @@ private String extractLang(TextDocument doc) { { result=result.substring(0, 2); } - System.err.println("DocumentParserBpipePlus::extractLang - lang after xpaths: "+result); + //System.err.println("DocumentParserBpipePlus::extractLang - lang after xpaths: "+result); //2. language detection - if (result == null ) { + if (result.length() < 2 ) { result= LID.detectFeedLanguage(doc.getContent(), getFeedLangs())[0]; + //System.err.println("DocumentParserBpipePlus::extractLang - LID lang : "+result); } //some previous functions may return null. Be sure and empty string is returned and not null, otherwise org.json won't print the property - if (result == null ) { - result=""; + if (result == null || result.length() < 2) { + result="unk"; } - + System.err.println("DocumentParserBpipePlus::extractLang - lang : "+result); + return result; } diff --git a/src/main/java/elh/eus/MSM/FeedReader.java b/src/main/java/elh/eus/MSM/FeedReader.java index 58bd4c8..a04bbf7 100644 --- a/src/main/java/elh/eus/MSM/FeedReader.java +++ b/src/main/java/elh/eus/MSM/FeedReader.java @@ -590,11 +590,7 @@ private void getRssFeed (Feed f, String store){ //if language accepted parse article for mentions. If found store them to DB or print them if (acceptedLangs.contains("all") || acceptedLangs.contains(lang)) { - PrintOptions printoptions= new PrintOptions(); - printoptions.setOrientation(PrintOptions.Orientation.LANDSCAPE); - printoptions.setScale(0.9); - printoptions.setPageSize(new PageSize(297,210)); - + if (kwrds.isEmpty()) { System.err.println("MSM::FeadReader::getFeed ->no keywords provided full articles will be returned"); @@ -616,6 +612,9 @@ private void getRssFeed (Feed f, String store){ } } } + else { + System.err.println("FeadReader::getFeed -> lang not accepted! "+lang+" -> "+link); + } } // else // { diff --git a/src/main/java/elh/eus/MSM/MSMUtils.java b/src/main/java/elh/eus/MSM/MSMUtils.java index 302c1e8..fd978a8 100644 --- a/src/main/java/elh/eus/MSM/MSMUtils.java +++ b/src/main/java/elh/eus/MSM/MSMUtils.java @@ -73,6 +73,7 @@ import twitter4j.JSONException; import twitter4j.JSONObject; +import org.jsoup.Jsoup; import org.jsoup.parser.Parser; //import com.mysql.jdbc.jdbc2.optional.MysqlDataSource; @@ -529,6 +530,11 @@ public static boolean saveHtml2pdf(FeedArticle in, String storePath, String link "

"+in.getText()+"

"; break; } + //clean and make html valid xml + org.jsoup.nodes.Document doc= Jsoup.parse(validxml); + doc.outputSettings().syntax(org.jsoup.nodes.Document.OutputSettings.Syntax.xml); + doc.outputSettings().escapeMode(org.jsoup.nodes.Entities.EscapeMode.xhtml); + validxml=doc.toString(); /* //clean and make html valid xml org.jsoup.nodes.Document doc= Jsoup.parse(in,link); @@ -539,7 +545,7 @@ public static boolean saveHtml2pdf(FeedArticle in, String storePath, String link //} doc.outputSettings().syntax(org.jsoup.nodes.Document.OutputSettings.Syntax.xml); doc.outputSettings().escapeMode(org.jsoup.nodes.Entities.EscapeMode.xhtml); - String validxml=doc.toString();*/ + String validxml=doc.toString();*/ //System.err.println("MSMUtils::saveHtml2pdf -> xml to store: "+validxml); //p.print(validxml); //p.close();