-
Notifications
You must be signed in to change notification settings - Fork 2
/
whatsnext.py
144 lines (112 loc) · 7.86 KB
/
whatsnext.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
import re
import random
from datetime import datetime
import time as time
import tweepy
import csv
#Character classes consumed by the tokenizer
#Letters (plus the typographic apostrophe) that are allowed to build up a token
alphabets = '’abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
#Characters the tokenizer skips over without ending the current token
punctuations = '\'(",;)”“'
#Text buffer that the generator functions append to
final_output = ''
#Transition table 'follow' -- maps a context string (key) to a dict of
#{following token: number of times it was seen after that context}
follow = dict()
def millis():
    """Return the current time as a whole number of milliseconds since the epoch."""
    seconds_now = time.time()
    return int(round(seconds_now * 1000))
#tokenize() (defined below, after the Twitter setup) tokenizes the training set -- it takes one parameter, the path of the txt file.
#It returns the list of tokens extracted from the training set.
#Twitter tweepy stuff
#Builds the module-level tweepy client `api`; whats_next() posts through api.update_status().
#This runs at import time, so tweepy must be importable before anything else in the module works.
#NOTE(review): the consumer key/secret and access token/secret are hard-coded in source --
#consider loading them from environment variables or an untracked config file (confirm with owner).
auth=tweepy.OAuthHandler('YyOFVAjYklqyXXXXXXXXX','XXXXXXXXXXXXXXXXXXXXXXxqIiPkrXWhbQMrDDYfJB')
auth.set_access_token('9249194698XXXXXXXXXXXXXXXXXXXXXf','RXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXHnPNBppR0UbheKf')
api=tweepy.API(auth)
def tokenize(training_set_dir, letters=None, ignored=None):
    """Split the text file at ``training_set_dir`` into word and '.' tokens.

    Characters are handled one at a time:

    * a character in ``ignored`` is skipped and does not end the current word;
    * any whitespace (space, newline, tab, ...) or a '.' ends the current word,
      and a '.' is additionally emitted as its own token;
    * a character in ``letters`` is appended to the current word;
    * everything else (digits, other symbols) is dropped.

    Args:
        training_set_dir: path of the text file to read (a file, despite the name).
        letters: characters allowed inside a word; defaults to the module-level
            ``alphabets``.
        ignored: characters skipped outright; defaults to the module-level
            ``punctuations``.

    Returns:
        list[str]: the tokens in the order they occur in the file.

    Raises:
        OSError: if the file cannot be opened.
    """
    if letters is None:
        letters = alphabets
    if ignored is None:
        ignored = punctuations

    #NOTE(review): opened with the platform default encoding -- confirm the corpus encoding
    with open(training_set_dir, 'r') as f:
        text = f.read()

    tokens = []
    word = []  #characters of the word currently being assembled
    for ch in text:
        if ch in ignored:
            continue
        if ch == '.' or ch.isspace():
            #Line breaks separate words too; otherwise the last word of one
            #line would fuse with the first word of the next
            if word:
                tokens.append(''.join(word))
                word = []
            if ch == '.':
                tokens.append('.')
        elif ch in letters:
            word.append(ch)
    if word:
        #Keep a final word that is not followed by a space or a period
        tokens.append(''.join(word))
    return tokens
def construct(n=1, training_set_dir='A:/markov.txt'):
    """Fill the global ``follow`` table with n-token successor counts.

    For every position ``i`` that has a token ``n`` places ahead, each of the
    contexts ``tokens[i+j:i+n]`` (``j = 0 .. n-1``, i.e. the space-joined runs
    of n, n-1, ..., 1 tokens ending just before ``tokens[i+n]``) gets its
    count for ``tokens[i+n]`` incremented.

    Contexts seen in this corpus replace any entry of the same name already
    in ``follow``; entries for other contexts are left as they are.

    Args:
        n: how many tokens ahead to look; values below 1 record nothing.
        training_set_dir: path of the training text handed to ``tokenize``.

    Returns:
        None. The result is written into the global ``follow`` dict as
        ``{context: {following token: occurrence count}}``.
    """
    tokens = tokenize(training_set_dir)
    if n < 1:
        return

    table = {}
    for i in range(len(tokens) - n):
        successor = tokens[i + n]
        for j in range(n):
            context = ' '.join(tokens[i + j:i + n])
            counts = table.setdefault(context, {})
            counts[successor] = counts.get(successor, 0) + 1

    #Publish in one step so every context seen here starts from fresh counts
    follow.update(table)
def transition(string, order=1, depth=1, limit=50):
    """Append a random walk over ``follow`` to the global ``final_output``.

    Starting from the context ``string``, repeatedly pick a following token
    with probability proportional to its recorded count, append it (plus a
    space) to the output, and slide the context forward. The walk stops once
    ``depth`` exceeds ``limit``; the output is then closed with '. '.

    If the current context has no recorded successors, the walk restarts from
    a randomly chosen context that has some. A restart does not reset the
    step count, so ``limit`` bounds the number of emitted tokens.

    Args:
        string: the starting context, a space-joined run of tokens.
        order: maximum number of trailing tokens kept as the next context;
            values below 1 are treated as 1.
        depth: step number to start counting from.
        limit: last step number for which a token is emitted.

    Returns:
        None. The generated text is appended to the global ``final_output``.
        With an empty ``follow`` table only '. ' is appended.
    """
    global final_output

    #Only contexts with at least one successor can continue the walk
    starts = [context for context, counts in follow.items() if counts]
    emitted = []
    if starts:
        width = max(order, 1)
        state = string
        #Iterative on purpose: long walks must not hit the recursion limit
        while depth <= limit:
            counts = follow.get(state)
            if not counts:
                #Dead end: jump to a context that is known to have successors
                state = random.choice(starts)
                continue
            #Draw the next token weighted by how often it followed this context
            word = random.choices(list(counts), weights=list(counts.values()))[0]
            emitted.append(word + ' ')
            #Slide the context window: append the new token, keep the last `width`
            state = ' '.join((state.split(' ') + [word])[-width:])
            depth += 1
    emitted.append('. ')
    final_output += ''.join(emitted)
def run():
    """Rebuild the transition table and generate fresh text into ``final_output``.

    Uses a fixed order of 1 and a step limit of 500. The global
    ``final_output`` is cleared first so that repeated calls do not pile new
    text on top of the previous run's output.

    Returns:
        None. The generated text is left in the global ``final_output``.

    Raises:
        ZeroDivisionError: if ``follow`` is empty after construction (the
            starting context is picked with a modulo over its size).
    """
    global final_output  #without this the assignment below would only bind a local
    final_output = ''
    n = 1              #order of the chain
    depth_limit = 500  #maximum number of generation steps
    construct(n)
    contexts = list(follow.keys())
    #Start from a clock-selected context and generate
    transition(contexts[millis() % len(contexts)], n, 1, depth_limit)
def whats_next():
    """Generate text, trim it to tweet size and post it through ``api``.

    The tweet is the generated text with its first character upper-cased, cut
    after the first '.' but never before 60 nor beyond 219 characters. A cut
    that ends on a space gets '...' appended, any other ending gets '-'.

    Raises:
        IndexError: if ``final_output`` is empty after ``run()``.
        ValueError: if ``final_output`` contains no '.'.
    """
    run()
    head = final_output[0].upper()
    after_first_stop = final_output.index('.') + 1
    cutoff = min(219, max(60, after_first_stop))
    tweet = head + final_output[1:cutoff]
    #Mark where the text was cut off
    tweet += '...' if tweet.endswith(' ') else '-'
    api.update_status(tweet)
#Script entry point: generate one tweet and post it.
#The call is unguarded, so it also runs whenever this module is imported.
whats_next()