diff --git a/app.py b/app.py index 45fc505..78cb723 100644 --- a/app.py +++ b/app.py @@ -2,6 +2,9 @@ from collections import Counter import re import json +from extract import extracted_email, extracted_phoneNumber, extracted_name, extracted_education, extracted_wrkexp, extracted_summary +import os +from pypdf import PdfReader app = Flask(__name__) @@ -16,30 +19,44 @@ def editor(): return render_template('editor.html') -@app.route('/submit', methods=['Post']) +UPLOAD_FOLDER = 'uploads' +os.makedirs(UPLOAD_FOLDER, exist_ok=True) +app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER + +@app.route('/upload', methods=['POST']) +def upload_file(): + file = request.files['file'] + if file: + file_path = os.path.join(app.config['UPLOAD_FOLDER'], file.filename) + file.save(file_path) + reader = PdfReader(file) + for i in range(len(reader.pages)): + page = reader.pages[i] + print(page.extract_text()) + return jsonify({'message': 'File uploaded successfully', 'file_path': file_path}) + return jsonify({'error': 'Invalid file type. Only PDFs are allowed.'}) + + +@app.route('/submit', methods=['POST']) def submit_data(): + data = request.json - words = re.findall(r'\b\w+\b', data.lower()) - word_counts = Counter(words) - # print(f'Word count: {word_counts}') - email = extracted_email(data) - phoneNumber = extracted_phoneNumber(data) - print(f'Data: {data}, Email: {email}, Phone: {phoneNumber}') + #words = re.findall(r'\b\w+\b', data.lower()) + #word_counts = Counter(words) + if isinstance(data, list): + datastr = ' '.join(data) + email = extracted_email(datastr) + phoneNumber = extracted_phoneNumber(datastr) + ''' + name = extracted_name(datastr) + education = extracted_education(data) + work = extracted_wrkexp(data) + summary = extracted_summary(data) + #print(f'Data: {data}: , Email:{email} , Phone:{phoneNumber} , Name: {name}, Education: {education}, Summary : {summary}, Work Experience : {work}') + ''' return jsonify({"received_data": data, "email": email, "phone": phoneNumber}) -def extracted_email(data): - email = re.findall("[a-zA-Z0-9!#$%&'*+/=?^_`{|}~-]+@[a-z]+.{4}", data.lower()) - if email: - return email[0] - return None - - -def extracted_phoneNumber(data): - phone = re.findall(r'\d{3}\W?\d{3}\W?\d{4}', data.lower()) - if phone: - return phone[0] - return None @app.route("/generate", methods=["GET"]) def generate_pdf(): diff --git a/extract.py b/extract.py new file mode 100644 index 0000000..2861169 --- /dev/null +++ b/extract.py @@ -0,0 +1,76 @@ + +import re + + + +def extracted_email(data): + email = re.findall("[a-zA-Z0-9!#$%&'*+/=?^_`{|}~-]+@[a-z]+.{4}", data.lower()) + if email: + return email[0] + return None + + +def extracted_phoneNumber(data): + phone = re.findall(r'\d{3}\W?\d{3}\W?\d{4}', data.lower()) + if phone: + return phone[0] + return None + + +def extracted_name(data): + ext_name = [] + pattern = r'\b[A-Z][a-z]*\b|\b[A-Z]+\b' + matches = list(re.finditer(pattern, data)) + if matches: + for i in range(2): + ext_name.append(matches[i].group()) + return ext_name + return None + + +def extracted_education(data): + # logic of this function, collect 10 elements after the word education appears + cleaned_data = [item.strip().lower() for item in data] + ext_education = [] + starting_i = -1 + for i in range(len(cleaned_data) - 1): + if cleaned_data[i] == 'education': + starting_i = i + if starting_i != -1: + for j in range(starting_i, (starting_i + 20), 1): + ext_education.append(cleaned_data[j]) + return ext_education + return None + + +def extracted_wrkexp(data): + # logic of this function, collect 10 elements after the word work experince appears + cleaned_data = [item.strip().lower() for item in data] + ext_work = [] + starting_i = -1 + for i in range(len(cleaned_data) - 1): + if ((cleaned_data[i] == 'work' and cleaned_data[i + 1] == 'experience') or + cleaned_data[i] == 'employment' or cleaned_data[i] == "experience"): + starting_i = i + if starting_i != -1: + for j in range(starting_i, (starting_i + 10), 1): + ext_work.append(cleaned_data[j]) + return ext_work + return None + + +def extracted_summary(data): + # logic of this function, collect 10 elements after summary or professional summary appears + ext_sum = [] + starting_i = -1 + for i in range(len(data) - 1): + if data[i].strip().lower() == 'professional summary': + starting_i = i + + if starting_i != -1: + for j in range(starting_i + 1, (starting_i + 10), 1): + ext_sum.append(data[j]) + return ext_sum + else: + return None + diff --git a/numOfIslands.py b/numOfIslands.py new file mode 100644 index 0000000..7bdca3d --- /dev/null +++ b/numOfIslands.py @@ -0,0 +1,177 @@ + + +grid = [ + ["1", "1", "1", "1"], + ["1", "1", "0", "0"], + ["0", "0", "0", "1"], + ["0", "0", "1", "1"] +] + + + + + + +def enclaves(grid): + enclave = 0 + for r in range(len(grid)): + for c in range(len(grid[r])): + if grid[r][c]= "1" + + + + + +''' + +def numOfIslands(grid): + islands = 0 + maxislandsize = 0 + for r in range(len(grid)): + for c in range(len(grid[r])): + if grid[r][c] == "1": + islandsize = dfs(grid,r,c) + islands += 1 + if islandsize > maxislandsize: + maxislandsize = islandsize + return islands, maxislandsize + + +def dfs(grid,row,col) -> int: + + if row < 0 or col < 0 or row >= len(grid) or col >= len(grid[row]) or grid[row][col] == "0": + return 0 + + size = 1 + grid[row][col] = "0" + size +=dfs(grid, row - 1, col) + size +=dfs(grid,row+1, col) + size +=dfs(grid,row,col+1) + size +=dfs(grid,row,col-1) + return size + + +print(numOfIslands(grid)) + + + +''' + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +def numOfIslands(grid): + island = 0 + for r in range(len(grid)): + for c in range(len(grid[r])): + if grid[r][c] == "1": + bfs(grid, r, c) + island += 1 + return island + + +def dfs(grid, row, col): + if row < 0 or col < 0 or row >= len(grid) or col >= len(grid[row]) or grid[row][col] == "0": + return + grid[row][col] = "0" + #transverse_stack = [(row-1, col),(row+1,col),(row,col+1),(row,col-1)] used recursive as oppose to iterative approach + dfs(grid, row-1, col) + dfs(grid, row, col-1) + dfs(grid,row+1,col) + dfs(grid,row, col+1) + +''' +def bfs(grid, row,col): + que = deque([(row, col)]) + while que: + r,c = popleft() + + if row > 0 and col > 0 and row <= len(grid) and col <= len(grid[0]) and grid[row][col] == "0": + continue + + grid[r][c] = "0" + + bfs.append((row, col + 1)) + bfs.append(grid, row, col - 1) + bfs.append(grid, row + 1, col) + bfs.append(grid, row - 1, col) + + + +''' + + + + +#print(numOfIslands(grid)) + +''' \ No newline at end of file diff --git a/numOfIslandsDfs.py b/numOfIslandsDfs.py new file mode 100644 index 0000000..15d69e3 --- /dev/null +++ b/numOfIslandsDfs.py @@ -0,0 +1,35 @@ + +grid = [ + ["1", "1", "1", "1"], + ["1", "1", "0", "0"], + ["0", "0", "0", "1"], + ["0", "0", "1", "1"] +] + + + +def numOfIslands(grid): + island = 0 + disjointed_sets = [] + for r in range(len(grid)): + for c in range(len(grid[r])): + if grid[r][c] != '0': + x = set() + dfs(grid, r, c, x) + island += 1 + disjointed_sets.append(x) + return disjointed_sets + + +def dfs(grid, row, col, seen): + if row < 0 or col < 0 or row >= len(grid) or col >= len(grid[row]) or grid[row][col] == "0": + return + seen.add((row,col)) + grid[row][col] = "0" + dfs(grid, row-1, col, seen) + dfs(grid, row, col-1, seen) + dfs(grid,row+1,col, seen) + dfs(grid,row, col+1, seen) + + + diff --git a/static/script.js b/static/script.js index ed3c869..23b1f0c 100644 --- a/static/script.js +++ b/static/script.js @@ -1,5 +1,6 @@ const fileInput = document.getElementById("myFile"); +fileInput.addEventListener("change", uploadFile); fileInput.addEventListener("change", handleFiles); function handleFiles(event) { @@ -13,7 +14,7 @@ function handleFiles(event) { function(pdf) { let textArray = []; let promises = []; - for (let i = 1; i <= pdf.numPages; i++) { + for (let i = 0; i < pdf.numPages; i++) { promises.push(extractText(pdf, i, textArray)); } Promise.all(promises).then(() => { @@ -21,6 +22,7 @@ function handleFiles(event) { }); }); }; + fileReader.readAsArrayBuffer(file); } @@ -38,3 +40,16 @@ function extractText(pdf, pageNumber, textArray) { console.error("Error reading file", error); }); } + +function uploadFile(event){ + const file = event.target.files[0]; + const formData = new FormData(); + formData.append('file', file); + + fetch('/upload',{ + method:'POST', + body: formData, + }).catch((error)=>{ + console.error("Error:", error) + }); +} \ No newline at end of file diff --git a/structured_json.py b/structured_json.py new file mode 100644 index 0000000..88df5ac --- /dev/null +++ b/structured_json.py @@ -0,0 +1,157 @@ +import json +from par import * +data = { + "version": "0.0.1", + "format": "orf", + "meta": { + "name": "John Doe", + "sort": "chronological", + "sort_order": "asc", + "tags": [ + "tech", + "software_development", + "backend" + ] + }, + "data": { + "personal": { + "name": "John Doe", + "phone_number": "8009001000", + "country_code": "+1", + "address_line_1": "null", + "address_line_2": "null", + "city": "Raleigh", + "state": "NC", + "zip": "null", + "country": "us", + "email": "john.doe@email.com", + "url_linkedin": "null", + "url_portfolio": "null", + "url_website": "null", + "url_other": [ + { + "text": "Facebook", + "url": "facebook.com/user" + }, + { + "text": "Dribble", + "url": "dribble.com/user" + } + ] + }, + "summary": "to obtain employment in XYZ field.", + "education": [ + { + "degree_level": "bachelors", + "degree_title": "B.A.", + "completed": "true", + "institution": "Earlham College", + "major": "Computer Science", + "minor": "null", + "concentration": "null", + "gpa": "3.99", + "gpa_scale": "4.00", + "institution_city": "Richmond", + "institution_state": "IN", + "institution_zip": "47374", + "institution_country": "us", + "start_day": 14, + "start_month": 9, + "start_year": 2014, + "end_day": 6, + "end_month": 5, + "end_year": 2018 + }, + { + "degree_level": "masters", + "degree_title": "M.S.", + "completed": "true", + "institution": "North Carolina State University", + "major": "Computer Engineering", + "minor": "null", + "concentration": "Computer Architecture & Systems", + "gpa": "3.99", + "gpa_scale": "4.00", + "institution_city": "Raleigh", + "institution_state": "NC", + "institution_zip": "27607", + "institution_country": "us", + "start_day": 14, + "start_month": 9, + "start_year": 2021, + "end_day": 6, + "end_month": 5, + "end_year": 2023 + } + ], + "experience": [ + { + "title": "Software Developer", + "employer": "XYZ Solutions, Inc", + "employer_city": "Lake Villa", + "employer_state": "IL", + "employer_zip": "60046", + "employer_country": "us", + "employer_phone_number": "null", + "description": [ + "Led a major REST API from development to production. Scheduled tasks and sprint goals based on team needs following Agile methods.", + "Replaced 200+ database tables with a new authorization service, achieving greater developer efficiency, lower maintenance cost, and fewer security incidents", + "Reduced response times by 95% (backend: 5s - .5s, frontend: 2m - 0.3s) after detailed performance analysis. Planned and led the execution of an optimization sprint.", + "Reduced client onboarding time from days to hours after migrating to reproducible, version-controlled Infrastructure-as-code solutions.", + "Enabled faster error detection, bugfixes, and reduced downtime through detailed exception tracking, stack trace analysis, and infrastructure health monitoring." + ] + } + ], + "skill": [ + { + "name": "c", + "proficiency_level": "beginner", + "catogory": "programmaing language" + }, + { + "name": "c++", + "proficiency_level": "intermediate", + "category": "programming language" + } + ], + "publication": [ + { + "title": "a groundbreaking research", + "url": "https://nature.com/next-einstein", + "year": 2018, + "credits": "Doe, J., et al", + "citation_format": "apa" + }, + { + "title": "Yet Another groundbreaking research", + "url": "https://nature.com/next-einstein", + "year": 2021, + "credits": "Doe, J., et al", + "citation_format": "apa" + } + ], + "projects": [ + { + "title": "Operating System Kernel Functionalities in XINU Microkernel", + "sub_title": "null", + "url": "https://github.com/salekinsirajus/realxinu", + "description": [ + "Implemented scheduling algorithms (lottery, MLFQ), Locks (Spin, Active, Priority Inversion), and fork() system call.", + "Used C and Assembly" + ] + }, + { + "title": "Standard for Machine-Readable Resume", + "sub_title": "null", + "url": "https://github.com/TheOpenResumeProject/OpenResume", + "description": [ + "Came up with a standard so that the data can be separated from formatting", + "Built a proof-of-concept using Flask, Python, and JavaScript" + ] + } + ] + } +} + +print(data['data']['education'][0]['degree_level']) +#print(data['meta']["name"]) \ No newline at end of file diff --git a/templates/editor.html b/templates/editor.html index 126470d..5aabc30 100644 --- a/templates/editor.html +++ b/templates/editor.html @@ -12,12 +12,13 @@

File Information

File name will be displayed here

-

Extracted Information

Email extracted:

Phone Number extracted:

+ +