-
Notifications
You must be signed in to change notification settings - Fork 0
/
main2.py
214 lines (180 loc) · 6.2 KB
/
main2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
#Document Enhancer
import docx
from docx import Document
from docx.shared import Inches
from docx.shared import Pt
import random
from docx.shared import RGBColor
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.enum.style import WD_STYLE_TYPE
from docx.enum.text import WD_BREAK
import re
import os
import sys
import torch
import random
import argparse
import numpy as np
from gpt2Pytorch.GPT2.model import (GPT2LMHeadModel)
from gpt2Pytorch.GPT2.utils import load_weight
from gpt2Pytorch.GPT2.config import GPT2Config
from gpt2Pytorch.GPT2.sample import sample_sequence
from gpt2Pytorch.GPT2.encoder import get_encoder
from gpt2Pytorch.mainLib import *
paragraph = 0
word = 0
titleLine=0
document = Document()
doc = docx.Document('test.docx')
def formatStyles():
style = document.styles['Normal']
font = style.font
font.name = 'Times New Roman'
font.size = Pt(12)
font.color.rgb = RGBColor(0,0,0)
font.underline = False
style.paragraph_format.line_spacing = 2
styles = document.styles
styles['Title'].delete()
style = styles.add_style('Title', WD_STYLE_TYPE.PARAGRAPH)
style = document.styles['Title']
font = style.font
font.name = 'Times New Roman'
font.size = Pt(20)
font.color.rgb = RGBColor(0,0,0)
font.underline = False
style.paragraph_format.line_spacing = 2
def addheader():
section = document.sections[0]
header = section.header
head = header.paragraphs[0]
list = readParagraph(0)
#document.add_paragraph("Testing", )
#name = list[0]
head.text = list.split( )[1]
#head.text = readWord(0,1)
head.alignment = WD_ALIGN_PARAGRAPH.RIGHT
def addheading():
list = readHeading()
#print(list)
#headingDate()
#for i in list:
# heading = document.add_paragraph(i)
heading = document.add_paragraph(list[0])
heading = document.add_paragraph(list[1])
heading = document.add_paragraph(list[2])
heading = document.add_paragraph(headingDate(list[3]))
heading.alignment = WD_ALIGN_PARAGRAPH.LEFT
heading.paragraph_format.line_spacing = 2
def addtitle():
#document.add_heading('Document Title', 0)
title = document.add_paragraph(readParagraph(titleLine), style='Title')
title.alignment = WD_ALIGN_PARAGRAPH.CENTER
#Adds body paragraphs into the body with new format.
def addbody():
list = readBody()
for i in list:
#print(list[i])
paragraph = document.add_paragraph('\t' + i[:-2])
for paragraph_text in AIconverter(readBody()[0]).split('\n\n'):
#print(paragraph_text.strip())
paragraph = document.add_paragraph("\t"+paragraph_text.strip())
#body.append(paragraph(paragraph_text.strip()))
#paragraph = document.add_paragraph(AItext)
paragraph_format = paragraph.paragraph_format
paragraph_format.line_spacing = 2
#adds new page and titles it "Works Cited"
def addReferences():
document.add_page_break()
wstitle = document.add_paragraph('Works Cited', style='Title')
wstitle.alignment = WD_ALIGN_PARAGRAPH.CENTER
#returns heading as list
def readHeading():
global titleLine
#doc = docx.Document('test.docx')
#headingList = [doc.paragraphs[0].text, doc.paragraphs[1].text, doc.paragraphs[2].text, doc.paragraphs[3].text]
headingList=[]
i = 0
while i <= 4:
#print(doc.paragraphs[i].alignment)
if doc.paragraphs[i].alignment != WD_ALIGN_PARAGRAPH.CENTER:
#print(str(i)+" line is not title")
headingList.append(doc.paragraphs[i].text)
else:
if i < 4:
headingList = ["FIRST LAST", "(PROF,Mr.,Mrs,Ms) NAME", "CLASS", "DD MMM YYYY"]
titleLine = i
break
i=i+1
print(headingList)
return headingList;
#returns any paragraph by its number
def readParagraph(paragraph):
#doc = docx.Document('test.docx')
return doc.paragraphs[paragraph].text
#returns list of all paragraphs after Title
def readBody():
#doc = docx.Document('test.docx')
list = []
i=titleLine+1
print(i)
while i < len(doc.paragraphs):
list.append(doc.paragraphs[i].text)
print(list)
i+=1
return list
def headingDate(date):
#print(date)
#date="mar 2020 24"
try:
day = re.search("([^\d])([0-2]|[0-2][0-9])([^\d])" , " "+date+" ")
year = re.search("[2-9][0-9][0-9][0-9]" , date)
month = re.search("[^\s\d][^\s\d][^\s\d]" , date)
newdate=day.group()[1:-1]+" "+month.group().capitalize()+". "+year.group()
#print(newdate)
except:
print("error, unable to corrct date ... ignoring")
newdate=date
return newdate
#returns true if a word is ignorable and false if important
def ignorable(word):
ignore = ["The", "the", "To", "to", "Of", "of", "Be", "be", "and", "A", "a", "That", "that", "Have", "have", "I",
"It", "it", "For", "for", "Not", "not", "With", "with", "You", "you", "As", "as", "Do", "do", "At", "at"
"This", "this", "By", "by", "or", "An", "an", "From", "from", "Will", "will", "Is", "is"]
for x in range( 0, len(ignore) ):
if(word == ignore[x]):
return True
return False
#takes in a string and returns a dictionary on the word count of each word
def getRepetitive( text ):
unique = {}
for word in text:
if ignorable(word) == False:
if len(unique) == 0:
unique[word] = 1
else:
for y in list(unique):
if(word == y):
unique[word] += 1
break
unique[word] = 1
return unique
# Hunter Testing
def phrases(string):
words = string.split()
result = []
for number in range(len(words)):
for start in range(len(words)-number):
result.append(" ".join(words[start:start+number+1]))
return result
#MAIN
def main():
formatStyles()
#addheader()
addheading()
addtitle()
#readBody() #just testing
addbody()
addReferences()
document.save('mla.docx')
main()