-
Notifications
You must be signed in to change notification settings - Fork 0
/
lda_train.py
58 lines (46 loc) · 1.77 KB
/
lda_train.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
import pickle
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import GridSearchCV
# This will load the preprocessed data and train an lda model that we will use in 'lda_save_topics'
# Set blei = True if you want to fit the model with respect to Blei's corpus. blei = False
# will use the speech recordings corpus.
# The lengh of the vocabulary can be set via the num_features variable
blei = True
num_features = 200
if blei:
corpus_path = '../lda-python/data/blei_samples.txt'
vocabulary_path = 'blei/blei_vocabulary.csv'
pickle_path = 'blei/lda_model.pkl'
else:
corpus_path = 'speech/corpus.txt'
vocabulary_path = 'speech/speech_vocabulary.csv'
pickle_path = 'speech/lda_model.pkl'
# Load corpus
corpus = np.loadtxt(corpus_path, delimiter='\n', dtype=str)
# Transform corpus and define vocabulary
count_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=num_features,
stop_words='english', lowercase=True)
X = count_vectorizer.fit_transform(corpus)
vocabulary = count_vectorizer.get_feature_names()
# Save vocabulary
np.savetxt(vocabulary_path, vocabulary, delimiter=',', fmt='%s')
# Transform to pandas dataframe
df_count = pd.DataFrame(X.todense(), columns=vocabulary)
# Set up grid search
search_params = {
'n_components': [10],
'learning_decay': [.5],
'learning_method': ['batch']
}
lda = LatentDirichletAllocation(max_iter=10, batch_size=24, n_jobs=-1)
model = GridSearchCV(lda, param_grid=search_params)
# Fit model
model.fit(X)
# Best model
best_model = model.best_estimator_
lda_pickle = pickle_path
with open(lda_pickle, 'wb') as file:
pickle.dump(best_model, file)