Skip to content

Restructure Extraction-related Code #15

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 11 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 36 additions & 19 deletions app.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@
from collections import Counter
import re
import json
from extract import extracted_email, extracted_phoneNumber, extracted_name, extracted_education, extracted_wrkexp, extracted_summary
import os
from pypdf import PdfReader

app = Flask(__name__)

Expand All @@ -16,30 +19,44 @@ def editor():
return render_template('editor.html')


@app.route('/submit', methods=['Post'])
# Directory (relative to the process working directory) where uploads are stored.
UPLOAD_FOLDER = 'uploads'
# Create the directory at import time; exist_ok=True makes this a no-op when it
# already exists.
os.makedirs(UPLOAD_FOLDER, exist_ok=True)
# Expose the location through the Flask config so handlers read it from one place.
app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER

@app.route('/upload', methods=['POST'])
def upload_file():
    """Store an uploaded PDF in the upload folder and print its text.

    Expects a multipart form field named ``file``.

    Returns:
        A JSON response with ``message`` and ``file_path`` on success, or a
        JSON response with ``error`` when no file was sent or the file name
        does not end in ``.pdf``.
    """
    file = request.files['file']
    # The file name is client-controlled: keep only the final path component
    # (treating backslashes as separators too) so that a name such as
    # "../../app.py" cannot escape the upload folder.
    filename = os.path.basename((file.filename or '').replace('\\', '/'))
    if file and filename.lower().endswith('.pdf'):
        file_path = os.path.join(app.config['UPLOAD_FOLDER'], filename)
        file.save(file_path)
        # Parse the saved copy rather than the upload stream, which save()
        # has already read through.
        reader = PdfReader(file_path)
        for page in reader.pages:
            print(page.extract_text())
        return jsonify({'message': 'File uploaded successfully', 'file_path': file_path})
    return jsonify({'error': 'Invalid file type. Only PDFs are allowed.'})


@app.route('/submit', methods=['POST'])
def submit_data():
    """Extract contact details from the JSON payload of the request.

    The payload may be a single string or a list of text fragments; a list
    is joined with spaces before searching. Any other payload type yields
    ``None`` for both extracted fields instead of raising.

    Returns:
        A JSON response with the keys ``received_data`` (the payload as
        received), ``email`` and ``phone``.
    """
    data = request.json
    if isinstance(data, list):
        # str() guards against non-string items in the list.
        datastr = ' '.join(str(item) for item in data)
    elif isinstance(data, str):
        datastr = data
    else:
        datastr = ''

    email = extracted_email(datastr)
    phoneNumber = extracted_phoneNumber(datastr)
    # NOTE: extracted_name / extracted_education / extracted_wrkexp /
    # extracted_summary are imported by this module but are not part of the
    # response yet.
    return jsonify({"received_data": data, "email": email, "phone": phoneNumber})


def extracted_email(data):
    """Return the first e-mail address found in ``data``, lower-cased.

    Args:
        data: Text to search.

    Returns:
        The first address as a lower-case string, or ``None`` when the text
        contains none.
    """
    # The domain must be dot-separated labels. The previous pattern ended in
    # ``.{4}`` (any four characters), which swallowed trailing text and cut
    # off long domains; it also rejected dots in the local part.
    match = re.search(
        r"[a-z0-9!#$%&'*+/=?^_`{|}~.-]+@[a-z0-9-]+(?:\.[a-z0-9-]+)+",
        data.lower(),
    )
    return match.group() if match else None


def extracted_phoneNumber(data):
    """Return the first 10-digit phone number found in ``data``.

    Digits may be grouped 3-3-4 with at most one separator character between
    groups; the area code may also be written in parentheses, optionally
    followed by one whitespace character, e.g. ``(555) 123-4567``.

    Args:
        data: Text to search.

    Returns:
        The matched text exactly as it appears in ``data``, or ``None``.
    """
    match = re.search(r'(?:\(\d{3}\)\s?|\d{3}\W?)\d{3}\W?\d{4}', data)
    return match.group() if match else None

@app.route("/generate", methods=["GET"])
def generate_pdf():
Expand Down
76 changes: 76 additions & 0 deletions extract.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@

import re



def extracted_email(data):
    """Return the first e-mail address found in ``data``, lower-cased.

    Args:
        data: Text to search.

    Returns:
        The first address as a lower-case string, or ``None`` when the text
        contains none.
    """
    # The domain must be dot-separated labels. The previous pattern ended in
    # ``.{4}`` (any four characters), which swallowed trailing text and cut
    # off long domains; it also rejected dots in the local part.
    match = re.search(
        r"[a-z0-9!#$%&'*+/=?^_`{|}~.-]+@[a-z0-9-]+(?:\.[a-z0-9-]+)+",
        data.lower(),
    )
    return match.group() if match else None


def extracted_phoneNumber(data):
    """Return the first 10-digit phone number found in ``data``.

    Digits may be grouped 3-3-4 with at most one separator character between
    groups; the area code may also be written in parentheses, optionally
    followed by one whitespace character, e.g. ``(555) 123-4567``.

    Args:
        data: Text to search.

    Returns:
        The matched text exactly as it appears in ``data``, or ``None``.
    """
    match = re.search(r'(?:\(\d{3}\)\s?|\d{3}\W?)\d{3}\W?\d{4}', data)
    return match.group() if match else None


def extracted_name(data):
    """Return the first capitalised words of ``data`` as a name guess.

    A word qualifies when it is either one upper-case letter followed by
    lower-case letters, or written entirely in upper case.

    Args:
        data: Text to search.

    Returns:
        A list with the first two qualifying words (only one when the text
        contains a single qualifying word), or ``None`` when there are none.
    """
    # Slicing tolerates a single match, where indexing the second match
    # unconditionally raised IndexError.
    words = re.findall(r'\b[A-Z][a-z]*\b|\b[A-Z]+\b', data)
    return words[:2] or None


def extracted_education(data, limit=20):
    """Return the items that start at the last ``education`` header.

    Every item is stripped and lower-cased before comparison, and the
    returned items are the cleaned versions. The header itself is the first
    returned element. A header in the very last position is ignored because
    nothing follows it.

    Args:
        data: Sequence of text fragments.
        limit: Maximum number of items to return, header included.

    Returns:
        A list of at most ``limit`` cleaned items, or ``None`` when no
        header is found.
    """
    cleaned = [item.strip().lower() for item in data]
    start = -1
    # No early exit: when the header occurs several times the last one wins.
    for index, item in enumerate(cleaned[:-1]):
        if item == 'education':
            start = index
    if start == -1:
        return None
    # Slicing stops at the end of the list, so short inputs no longer raise
    # IndexError.
    return cleaned[start:start + limit]


def extracted_wrkexp(data, limit=10):
    """Return the items that start at the last work-experience header.

    Every item is stripped and lower-cased before comparison. A header is
    ``employment``, ``experience``, or ``work`` immediately followed by
    ``experience``. The header itself is the first returned element. The
    final item of ``data`` is never treated as a header.

    Args:
        data: Sequence of text fragments.
        limit: Maximum number of items to return, header included.

    Returns:
        A list of at most ``limit`` cleaned items, or ``None`` when no
        header is found.
    """
    cleaned = [item.strip().lower() for item in data]
    start = -1
    # Walk adjacent pairs; no early exit, so the last matching header wins.
    for index, (current, following) in enumerate(zip(cleaned, cleaned[1:])):
        if ((current == 'work' and following == 'experience')
                or current in ('employment', 'experience')):
            start = index
    if start == -1:
        return None
    # Slicing stops at the end of the list, so short inputs no longer raise
    # IndexError.
    return cleaned[start:start + limit]


def extracted_summary(data, limit=9):
    """Return the items that follow the last ``professional summary`` header.

    Items are stripped and lower-cased only for the header comparison; the
    returned items are taken from ``data`` unchanged and do not include the
    header. The final item of ``data`` is never treated as a header.

    NOTE(review): the original comment also mentioned a plain ``summary``
    header, but only ``professional summary`` is matched. Confirm which is
    intended.

    Args:
        data: Sequence of text fragments.
        limit: Maximum number of items to return.

    Returns:
        A list of at most ``limit`` items, or ``None`` when no header is
        found.
    """
    start = -1
    # No early exit: when the header occurs several times the last one wins.
    for index, item in enumerate(data[:-1]):
        if item.strip().lower() == 'professional summary':
            start = index
    if start == -1:
        return None
    first = start + 1
    # Slicing stops at the end of the list, so short inputs no longer raise
    # IndexError.
    return data[first:first + limit]

177 changes: 177 additions & 0 deletions numOfIslands.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,177 @@


# Sample input for manual runs: a list of rows whose cells are the strings
# "1" or "0". The island functions in this file start a search at "1" cells
# and stop at "0" cells.
grid = [
["1", "1", "1", "1"],
["1", "1", "0", "0"],
["0", "0", "0", "1"],
["0", "0", "1", "1"]
]






def enclaves(grid):
    """Count the "1" cells that cannot reach the edge of the grid.

    NOTE(review): the original body was an unfinished fragment that did not
    parse. This completion assumes the usual "number of enclaves" meaning:
    a "1" cell is enclosed when no path of horizontally/vertically adjacent
    "1" cells leads from it to a cell on the border. Confirm the intent.

    Args:
        grid: List of rows whose cells are the strings "1" or "0". The grid
            is not modified.

    Returns:
        The number of enclosed "1" cells.
    """
    def inside(r, c):
        # Row lengths are checked per row, so ragged grids are tolerated.
        return 0 <= r < len(grid) and 0 <= c < len(grid[r])

    steps = ((-1, 0), (1, 0), (0, -1), (0, 1))

    # Seed the search with every "1" cell that has a neighbour off the grid.
    escaped = set()
    pending = []
    for r in range(len(grid)):
        for c in range(len(grid[r])):
            if grid[r][c] == "1" and any(
                    not inside(r + dr, c + dc) for dr, dc in steps):
                escaped.add((r, c))
                pending.append((r, c))

    # Spread from the border cells to everything connected to them.
    while pending:
        r, c = pending.pop()
        for dr, dc in steps:
            nr, nc = r + dr, c + dc
            if inside(nr, nc) and grid[nr][nc] == "1" and (nr, nc) not in escaped:
                escaped.add((nr, nc))
                pending.append((nr, nc))

    enclave = 0
    for r in range(len(grid)):
        for c in range(len(grid[r])):
            if grid[r][c] == "1" and (r, c) not in escaped:
                enclave += 1
    return enclave





'''

def numOfIslands(grid):
islands = 0
maxislandsize = 0
for r in range(len(grid)):
for c in range(len(grid[r])):
if grid[r][c] == "1":
islandsize = dfs(grid,r,c)
islands += 1
if islandsize > maxislandsize:
maxislandsize = islandsize
return islands, maxislandsize


def dfs(grid,row,col) -> int:

if row < 0 or col < 0 or row >= len(grid) or col >= len(grid[row]) or grid[row][col] == "0":
return 0

size = 1
grid[row][col] = "0"
size +=dfs(grid, row - 1, col)
size +=dfs(grid,row+1, col)
size +=dfs(grid,row,col+1)
size +=dfs(grid,row,col-1)
return size


print(numOfIslands(grid))



'''









































































def numOfIslands(grid):
    """Count the groups of connected "1" cells in ``grid``.

    Args:
        grid: List of rows whose cells are the strings "1" or "0". The grid
            is modified in place: every visited "1" cell is overwritten
            with "0".

    Returns:
        The number of islands found.
    """
    island = 0
    for r in range(len(grid)):
        for c in range(len(grid[r])):
            if grid[r][c] == "1":
                # Use dfs: the only `bfs` in this file sits inside a string
                # literal and is never defined, so calling it raised
                # NameError.
                dfs(grid, r, c)
                island += 1
    return island


def dfs(grid, row, col):
    """Overwrite with "0" every cell connected to ``(row, col)``.

    Cells are connected horizontally and vertically; any cell whose value
    is not "0" is treated as part of the island. The traversal uses an
    explicit stack instead of recursion so that large islands do not exceed
    the interpreter's recursion limit.

    Args:
        grid: List of rows; modified in place.
        row: Row index of the starting cell (may be out of range).
        col: Column index of the starting cell (may be out of range).

    Returns:
        None.
    """
    stack = [(row, col)]
    while stack:
        r, c = stack.pop()
        if r < 0 or c < 0 or r >= len(grid) or c >= len(grid[r]) or grid[r][c] == "0":
            continue
        # Mark before expanding so the cell is never processed twice.
        grid[r][c] = "0"
        stack.extend(((r - 1, c), (r, c - 1), (r + 1, c), (r, c + 1)))

'''
def bfs(grid, row,col):
que = deque([(row, col)])
while que:
r,c = popleft()

if row > 0 and col > 0 and row <= len(grid) and col <= len(grid[0]) and grid[row][col] == "0":
continue

grid[r][c] = "0"

bfs.append((row, col + 1))
bfs.append(grid, row, col - 1)
bfs.append(grid, row + 1, col)
bfs.append(grid, row - 1, col)



'''




# print(numOfIslands(grid))
35 changes: 35 additions & 0 deletions numOfIslandsDfs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@

# Sample input for manual runs: a list of rows whose cells are the strings
# "1" or "0". The functions in this file start a search at any cell that is
# not "0" and stop at "0" cells.
grid = [
["1", "1", "1", "1"],
["1", "1", "0", "0"],
["0", "0", "0", "1"],
["0", "0", "1", "1"]
]



def numOfIslands(grid):
    """Group the non-"0" cells of ``grid`` into connected components.

    Each cell that is not "0" starts a flood fill via ``dfs``, which records
    the coordinates it visits and overwrites them with "0" in ``grid``.

    Args:
        grid: List of rows of cell strings; modified in place.

    Returns:
        A list of sets, one per component, each holding the ``(row, col)``
        tuples of its cells.
    """
    components = []
    for r, cells in enumerate(grid):
        for c in range(len(cells)):
            if grid[r][c] == '0':
                continue
            members = set()
            dfs(grid, r, c, members)
            components.append(members)
    return components


def dfs(grid, row, col, seen):
    """Record and clear every cell connected to ``(row, col)``.

    Cells are connected horizontally and vertically; any cell whose value
    is not "0" is treated as part of the component. The traversal uses an
    explicit stack instead of recursion so that large components do not
    exceed the interpreter's recursion limit.

    Args:
        grid: List of rows; visited cells are overwritten with "0".
        row: Row index of the starting cell (may be out of range).
        col: Column index of the starting cell (may be out of range).
        seen: Set that receives the ``(row, col)`` tuple of every visited
            cell.

    Returns:
        None.
    """
    stack = [(row, col)]
    while stack:
        r, c = stack.pop()
        if r < 0 or c < 0 or r >= len(grid) or c >= len(grid[r]) or grid[r][c] == "0":
            continue
        seen.add((r, c))
        # Mark before expanding so the cell is never processed twice.
        grid[r][c] = "0"
        stack.extend(((r - 1, c), (r, c - 1), (r + 1, c), (r, c + 1)))



Loading