Skip to content

Commit

Permalink
fixed bug with xml generation and language identification
Browse files Browse the repository at this point in the history
  • Loading branch information
isanvicente committed Sep 6, 2023
1 parent 3c3db23 commit dfced85
Show file tree
Hide file tree
Showing 3 changed files with 18 additions and 11 deletions.
12 changes: 7 additions & 5 deletions src/main/java/elh/eus/MSM/DocumentParserBPipePlus.java
Original file line number Diff line number Diff line change
Expand Up @@ -813,19 +813,21 @@ private String extractLang(TextDocument doc) {
{
result=result.substring(0, 2);
}
System.err.println("DocumentParserBpipePlus::extractLang - lang after xpaths: "+result);
//System.err.println("DocumentParserBpipePlus::extractLang - lang after xpaths: "+result);

//2. language detection
if (result == null ) {
if (result.length() < 2 ) {
result= LID.detectFeedLanguage(doc.getContent(), getFeedLangs())[0];
//System.err.println("DocumentParserBpipePlus::extractLang - LID lang : "+result);
}

//some previous functions may return null. Be sure an empty string is returned and not null, otherwise org.json won't print the property
if (result == null ) {
result="";
if (result == null || result.length() < 2) {
result="unk";
}


System.err.println("DocumentParserBpipePlus::extractLang - lang : "+result);


return result;
}
Expand Down
9 changes: 4 additions & 5 deletions src/main/java/elh/eus/MSM/FeedReader.java
Original file line number Diff line number Diff line change
Expand Up @@ -590,11 +590,7 @@ private void getRssFeed (Feed f, String store){
//if language accepted parse article for mentions. If found store them to DB or print them
if (acceptedLangs.contains("all") || acceptedLangs.contains(lang))
{
PrintOptions printoptions= new PrintOptions();
printoptions.setOrientation(PrintOptions.Orientation.LANDSCAPE);
printoptions.setScale(0.9);
printoptions.setPageSize(new PageSize(297,210));


if (kwrds.isEmpty())
{
System.err.println("MSM::FeadReader::getFeed ->no keywords provided full articles will be returned");
Expand All @@ -616,6 +612,9 @@ private void getRssFeed (Feed f, String store){
}
}
}
else {
System.err.println("FeadReader::getFeed -> lang not accepted! "+lang+" -> "+link);
}
}
// else
// {
Expand Down
8 changes: 7 additions & 1 deletion src/main/java/elh/eus/MSM/MSMUtils.java
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@
import twitter4j.JSONException;
import twitter4j.JSONObject;

import org.jsoup.Jsoup;
import org.jsoup.parser.Parser;

//import com.mysql.jdbc.jdbc2.optional.MysqlDataSource;
Expand Down Expand Up @@ -529,6 +530,11 @@ public static boolean saveHtml2pdf(FeedArticle in, String storePath, String link
"<p>"+in.getText()+"</p></body></html>";
break;
}
//clean and make html valid xml
org.jsoup.nodes.Document doc= Jsoup.parse(validxml);
doc.outputSettings().syntax(org.jsoup.nodes.Document.OutputSettings.Syntax.xml);
doc.outputSettings().escapeMode(org.jsoup.nodes.Entities.EscapeMode.xhtml);
validxml=doc.toString();
/*
//clean and make html valid xml
org.jsoup.nodes.Document doc= Jsoup.parse(in,link);
Expand All @@ -539,7 +545,7 @@ public static boolean saveHtml2pdf(FeedArticle in, String storePath, String link
//}
doc.outputSettings().syntax(org.jsoup.nodes.Document.OutputSettings.Syntax.xml);
doc.outputSettings().escapeMode(org.jsoup.nodes.Entities.EscapeMode.xhtml);
String validxml=doc.toString();*/
String validxml=doc.toString();*/
//System.err.println("MSMUtils::saveHtml2pdf -> xml to store: "+validxml);
//p.print(validxml);
//p.close();
Expand Down

0 comments on commit dfced85

Please sign in to comment.