Skip to content

Text analysis #10

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 6 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .idea/.gitignore

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions .idea/.name

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

17 changes: 17 additions & 0 deletions .idea/Env1.iml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

28 changes: 28 additions & 0 deletions .idea/inspectionProfiles/Project_Default.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 6 additions & 0 deletions .idea/inspectionProfiles/profiles_settings.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

10 changes: 10 additions & 0 deletions .idea/misc.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 8 additions & 0 deletions .idea/modules.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 6 additions & 0 deletions .idea/vcs.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 1 addition & 2 deletions app.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,13 +15,11 @@ def editor():
return render_template('editor.html')



@app.route('/submit', methods=['Post'])
def submit_data():
data = request.json
words = re.findall(r'\b\w+\b', data.lower())
word_counts = Counter(words)
# print(f'Word count: {word_counts}')
email = extracted_email(data)
phoneNumber = extracted_phoneNumber(data)
print(f'Data: {data}, Email: {email}, Phone: {phoneNumber}')
Expand All @@ -44,5 +42,6 @@ def extracted_phoneNumber(data):



if __name__ == '__main__':
    # NOTE(review): debug=True enables Flask's interactive debugger and
    # auto-reloader; keep this to local development only.
    app.run(debug=True)
78 changes: 78 additions & 0 deletions pdfReader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
from pypdf import PdfReader
import re


def cleaner(list_array, item):
    """Return a new list with every element equal to ``item`` removed.

    Args:
        list_array: The sequence to filter. It is not modified.
        item: The value to drop; elements are compared with ``!=``.

    Returns:
        A list of the remaining elements in their original order.
    """
    return [element for element in list_array if element != item]


def _find_heading(words, names, next_word=None):
    """Return the index of the first word that is a section heading, or None.

    A word matches when, stripped of surrounding whitespace, it is one of
    ``names``. If ``next_word`` is given, the word that follows must also
    strip to ``next_word`` (used for the two-word heading "work experience:").
    """
    for index, word in enumerate(words):
        if word.strip() not in names:
            continue
        if next_word is None:
            return index
        # Bounds check: a heading word at the very end has no follower.
        if index + 1 < len(words) and words[index + 1].strip() == next_word:
            return index
    return None


def _section(words, start, end_marker, missing_message):
    """Return the words from ``start`` up to the first ``end_marker`` after it.

    ``end_marker=None`` means the section runs to the end of ``words``.
    Prints a diagnostic and returns an empty list when the heading was not
    found (``start is None``) or the end marker does not occur after it.
    """
    if start is None:
        print(missing_message)
        return []
    if end_marker is None:
        return words[start:]
    try:
        end = words.index(end_marker, start + 1)
    except ValueError:
        print('Value does not exist')
        return []
    return words[start:end]


def pefReader(data):
    """Print, and return, the resume sections found in the first two pages of a PDF.

    The text of at most the first two pages is lower-cased and split on
    single spaces. Each section is the run of words from its heading up to
    the marker that opens the next section:

    * summary: "objective" or "summary" up to "\\neducation:"
    * education: "education:" up to "\\nwork"
    * work: "work" followed by "experience:" up to "\\nleadership"
    * publications: "publications" or "projects" up to the end of the text
    * personal data: the first 15 words

    Args:
        data: Passed unchanged to ``PdfReader``.

    Returns:
        A dict with the keys ``personal_data``, ``summary``, ``education``,
        ``work`` and ``publications``, each mapping to a list of words.
        A section that cannot be located is an empty list, and a message is
        printed for it.
    """
    reader = PdfReader(data)

    # Only the first two pages are inspected; min() keeps a one-page
    # document from raising IndexError.
    page_count = min(2, len(reader.pages))
    content = ''.join(
        reader.pages[number].extract_text() for number in range(page_count)
    )

    # Splitting on single spaces leaves the leading '\n' attached to the first
    # word of each line, which the end markers below rely on. Dropping the
    # empty strings is equivalent to collapsing runs of spaces first.
    new_arr = cleaner(content.lower().split(' '), '')

    education_array = _section(
        new_arr,
        _find_heading(new_arr, ('education:',)),
        '\nwork',
        'Educational experience not found',
    )
    work_array = _section(
        new_arr,
        _find_heading(new_arr, ('work',), next_word='experience:'),
        '\nleadership',
        'Work experience not found',
    )
    summary_array = _section(
        new_arr,
        _find_heading(new_arr, ('objective', 'summary')),
        '\neducation:',
        'Professional summary not found',
    )
    # NOTE(review): the previous code looked for '' as the end of this
    # section, which can never be present once empty strings are removed.
    # It is treated here as "to the end of the text" - confirm that intent.
    publications_array = _section(
        new_arr,
        _find_heading(new_arr, ('publications', 'projects')),
        None,
        'Publications not found',
    )
    personal_data = new_arr[:15]

    print(f'\n Personal Data: {personal_data} \n\n Candidate Objective: {summary_array} \n\n '
          f'Educational experince : {education_array} \n\n Work Experience : {work_array}')

    return {
        'personal_data': personal_data,
        'summary': summary_array,
        'education': education_array,
        'work': work_array,
        'publications': publications_array,
    }


# Runs the parser against a hard-coded file name every time this module is
# loaded, including when it is imported.
# NOTE(review): consider wrapping this in `if __name__ == '__main__':` so that
# importing the module does not require the PDF to exist - confirm that no
# caller relies on the import-time run first.
pefReader('MMujtaba-CV.pdf')