-
Notifications
You must be signed in to change notification settings - Fork 9
/
SemantriaWrapper.cs
140 lines (118 loc) · 3.71 KB
/
SemantriaWrapper.cs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
using System;
using System.Collections;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Threading;
using System.Threading.Tasks;
using Semantria.Com;
using Semantria.Com.Serializers;
using Semantria.Com.Mapping;
using Semantria.Com.Mapping.Configuration;
using Semantria.Com.Mapping.Output;
using Clifton.Assertions;
using Clifton.ExtensionMethods;
namespace NlpComparison
{
    /// <summary>
    /// Wraps a Semantria <see cref="Session"/>: reads the API credentials from a file,
    /// queues a single document for analysis, polls until the result is available and
    /// exposes the entities, themes and topics of the analyzed document(s).
    /// </summary>
    public class SemantriaWrapper
    {
        /// <summary>File (relative to the current directory) holding the consumer key and secret.</summary>
        private const string ApiKeyFile = "semantriaapikey.txt";

        /// <summary>Delay between two polls for the document result, in milliseconds.</summary>
        private const int PollIntervalMilliseconds = 100;

        /// <summary>Maximum time to wait for a queued document before giving up, in seconds.</summary>
        private const int TimeoutSeconds = 15;

        protected string consumerKey;
        protected string consumerSecret;
        protected JsonSerializer serializer;
        protected Session session;

        // https://semantria.com/support/developer/overview/processing
        protected List<DocAnalyticData> docResults;

        protected Configuration config;
        protected string configID = null;

        public SemantriaWrapper()
        {
        }

        /// <summary>
        /// Reads the consumer key (first non-blank line) and consumer secret (second non-blank
        /// line) from "semantriaapikey.txt" and creates the Semantria session.
        /// </summary>
        /// <exception cref="ApplicationException">
        /// The key file does not contain two non-blank lines.
        /// </exception>
        public void Initialize()
        {
            string apikey = File.ReadAllText(ApiKeyFile);

            // Accept CRLF, LF and CR line endings alike and ignore blank lines, so the file
            // parses the same way regardless of the editor/platform that produced it.
            string[] keys = apikey
                .Split(new[] { '\r', '\n' })
                .Select(line => line.Trim())
                .Where(line => line.Length > 0)
                .ToArray();

            if (keys.Length < 2)
            {
                throw new ApplicationException(
                    ApiKeyFile + " must contain the consumer key and the consumer secret on separate lines.");
            }

            consumerKey = keys[0];
            consumerSecret = keys[1];
            serializer = new JsonSerializer();
            session = Session.CreateSession(consumerKey, consumerSecret, serializer);

            // IncreaseLimits();   // Disabled: fetching the configurations from the server takes considerable time.
        }

        /// <summary>
        /// Queues <paramref name="content"/> as a single Semantria document and blocks, polling
        /// every 100 ms, until the document leaves the QUEUED state. On success the result
        /// replaces any previous results held by this instance.
        /// Despite the method name, the argument is used as the document text itself.
        /// </summary>
        /// <param name="content">Text assigned to the queued document's <c>Text</c> property.</param>
        /// <exception cref="InvalidOperationException"><see cref="Initialize"/> has not been called.</exception>
        /// <exception cref="ApplicationException">
        /// The document is still pending after 15 seconds, or it finished with a status other
        /// than PROCESSED.
        /// </exception>
        public void ParseUrl(string content)
        {
            if (session == null)
            {
                throw new InvalidOperationException("Initialize() must be called before ParseUrl().");
            }

            // Document process rather than collection processing.
            string docId = Guid.NewGuid().ToString();
            Document doc = new Document() { Id = docId, Text = content };
            docResults = new List<DocAnalyticData>();

            // NOTE(review): QueueDocument returns an int that is not inspected; confirm against
            // the SDK whether it can signal a failed submission worth surfacing here.
            session.QueueDocument(doc, configID);

            // UTC is immune to daylight-saving/time-zone shifts while measuring elapsed time.
            DateTime start = DateTime.UtcNow;
            DocAnalyticData ret;
            bool pending;

            do
            {
                // Semantria guarantees a result within 10 seconds.  But how fast is it really?
                Thread.Sleep(PollIntervalMilliseconds);
                ret = session.GetDocument(doc.Id, configID);

                // NOTE(review): a null result is treated as "not available yet" - presumably the
                // SDK returns null when the request fails; confirm against the SDK.
                pending = ret == null || ret.Status == Semantria.Com.TaskStatus.QUEUED;

                // Only give up while the document is still pending, so a result that arrives on
                // the last poll is not thrown away.
                if (pending && (DateTime.UtcNow - start).TotalSeconds > TimeoutSeconds)
                {
                    throw new ApplicationException("Semantria did not return with " + TimeoutSeconds + " seconds.");
                }
            } while (pending);

            if (ret.Status != Semantria.Com.TaskStatus.PROCESSED)
            {
                throw new ApplicationException("Error processing document: " + ret.Status.ToString());
            }

            docResults.Add(ret);
        }

        /// <summary>
        /// Returns the entities of all analyzed documents as a list of <c>DocEntity</c>.
        /// The list is empty when nothing has been analyzed or no entities are present.
        /// </summary>
        public IList GetEntities()
        {
            return Collect<DocEntity>(d => d.Entities);
        }

        /// <summary>
        /// Returns the themes of all analyzed documents as a list of <c>DocTheme</c>.
        /// The list is empty when nothing has been analyzed or no themes are present.
        /// </summary>
        public IList GetThemes()
        {
            return Collect<DocTheme>(d => d.Themes);
        }

        /// <summary>
        /// Returns the topics of all analyzed documents as a list of <c>DocTopic</c>.
        /// The list is empty when nothing has been analyzed or no topics are present.
        /// </summary>
        public IList GetTopics()
        {
            return Collect<DocTopic>(d => d.Topics);
        }

        /// <summary>
        /// Raises the named-entity, concept-topic and entity-theme limits of the "English"
        /// configuration to 50 and sends the updated configuration back to the server.
        /// Does nothing when no such configuration is returned.
        /// </summary>
        protected void IncreaseLimits()
        {
            // This takes considerable time to get the configurations back from the server.
            List<Configuration> configurations = session.GetConfigurations();
            if (configurations == null)
            {
                return;
            }

            // Static Equals tolerates a null Language; Ordinal matches what Equals(string) does.
            config = configurations.FirstOrDefault(
                item => string.Equals(item.Language, "English", StringComparison.Ordinal));

            if (config != null)
            {
                config.Document.NamedEntitiesLimit = 50;
                config.Document.ConceptTopicsLimit = 50;
                config.Document.EntityThemesLimit = 50;
                session.UpdateConfigurations(new List<Configuration>() { config });
            }
        }

        /// <summary>
        /// The content needs to be split into small chunks, otherwise an exception is thrown (line too long.)
        /// Each non-blank line of <paramref name="content"/> (split on '\n', trimmed) becomes one
        /// document of the returned collection; null content yields an empty collection.
        /// </summary>
        /// <param name="content">Text to split into lines.</param>
        /// <returns>A new collection with a fresh GUID as its Id.</returns>
        protected Collection SplitContent(string content)
        {
            Collection collection = new Collection() { Id = Guid.NewGuid().ToString(), Documents = new List<string>() };

            if (content == null)
            {
                return collection;
            }

            foreach (string line in content.Split('\n'))
            {
                string trimmed = line.Trim();

                // Ignore empty lines.
                if (!String.IsNullOrEmpty(trimmed))
                {
                    collection.Documents.Add(trimmed);
                }
            }

            return collection;
        }

        /// <summary>
        /// Concatenates the per-document sequences picked by <paramref name="selector"/> over
        /// all stored results, skipping documents for which the sequence is null.
        /// </summary>
        private List<T> Collect<T>(Func<DocAnalyticData, IEnumerable<T>> selector)
        {
            List<T> items = new List<T>();

            // No results yet (ParseUrl not called): report "nothing found" instead of failing.
            if (docResults == null)
            {
                return items;
            }

            foreach (DocAnalyticData result in docResults)
            {
                IEnumerable<T> part = selector(result);

                // NOTE(review): presumably the SDK leaves a list null when the document has no
                // items of that kind; AddRange would throw on null, so skip it.
                if (part != null)
                {
                    items.AddRange(part);
                }
            }

            return items;
        }
    }
}