From 2f55780edb4e6d3039eb3ea4e0190ed435978aaa Mon Sep 17 00:00:00 2001 From: MujtabaMuhammad <34104990+MujtabaMuhammad@users.noreply.github.com> Date: Sun, 22 Sep 2024 17:14:10 -0400 Subject: [PATCH 1/4] app.py file was cleaned up --- app.py | 5 +++-- pdfReader.py | 23 +++++++++++++++++++++++ 2 files changed, 26 insertions(+), 2 deletions(-) create mode 100644 pdfReader.py diff --git a/app.py b/app.py index 46939d4..9b26037 100644 --- a/app.py +++ b/app.py @@ -15,13 +15,11 @@ def editor(): return render_template('editor.html') - @app.route('/submit', methods=['Post']) def submit_data(): data = request.json words = re.findall(r'\b\w+\b', data.lower()) word_counts = Counter(words) - # print(f'Word count: {word_counts}') email = extracted_email(data) phoneNumber = extracted_phoneNumber(data) print(f'Data: {data}, Email: {email}, Phone: {phoneNumber}') @@ -43,5 +41,8 @@ def extracted_phoneNumber(data): + + + if __name__ == '__main__': app.run(debug=True) diff --git a/pdfReader.py b/pdfReader.py new file mode 100644 index 0000000..0866b4f --- /dev/null +++ b/pdfReader.py @@ -0,0 +1,23 @@ +from pypdf import PdfReader +import re + + +def pefReader(data): + reader = PdfReader(data) + content = reader.pages[0].extract_text() + content_lower = content.lower() + education_array = [] + content_array = content_lower.split(' ') + elements = ' '.join(content_array) + pattern_2 = r' {2,}' + results = re.sub(pattern_2, ' ', elements) + result_arr = results.split(' ') + for i, x in enumerate(result_arr): + if x == 'education': + for j in range(i - 2, i + 120): + education_array.append(content_array[j]) + education_exp = ' '.join(education_array) + #print(content_array) + print(education_exp) + +pefReader('MMujtaba-CV.pdf') From a33665344e718b1aa35e8d3a8d8779b2e5071a99 Mon Sep 17 00:00:00 2001 From: MujtabaMuhammad <34104990+MujtabaMuhammad@users.noreply.github.com> Date: Sun, 22 Sep 2024 17:16:22 -0400 Subject: [PATCH 2/4] branch for text analysis --- .idea/.gitignore | 3 ++ .idea/.name | 1 + .idea/Env1.iml | 17 +++++++++++ .idea/inspectionProfiles/Project_Default.xml | 28 +++++++++++++++++++ .../inspectionProfiles/profiles_settings.xml | 6 ++++ .idea/misc.xml | 10 +++++++ .idea/modules.xml | 8 ++++++ .idea/vcs.xml | 6 ++++ 8 files changed, 79 insertions(+) create mode 100644 .idea/.gitignore create mode 100644 .idea/.name create mode 100644 .idea/Env1.iml create mode 100644 .idea/inspectionProfiles/Project_Default.xml create mode 100644 .idea/inspectionProfiles/profiles_settings.xml create mode 100644 .idea/misc.xml create mode 100644 .idea/modules.xml create mode 100644 .idea/vcs.xml diff --git a/.idea/.gitignore b/.idea/.gitignore new file mode 100644 index 0000000..26d3352 --- /dev/null +++ b/.idea/.gitignore @@ -0,0 +1,3 @@ +# Default ignored files +/shelf/ +/workspace.xml diff --git a/.idea/.name b/.idea/.name new file mode 100644 index 0000000..aa15e3e --- /dev/null +++ b/.idea/.name @@ -0,0 +1 @@ +app.py \ No newline at end of file diff --git a/.idea/Env1.iml b/.idea/Env1.iml new file mode 100644 index 0000000..e110c18 --- /dev/null +++ b/.idea/Env1.iml @@ -0,0 +1,17 @@ + + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/inspectionProfiles/Project_Default.xml b/.idea/inspectionProfiles/Project_Default.xml new file mode 100644 index 0000000..054a04d --- /dev/null +++ b/.idea/inspectionProfiles/Project_Default.xml @@ -0,0 +1,28 @@ + + + + \ No newline at end of file diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml new file mode 100644 index 0000000..105ce2d --- /dev/null +++ b/.idea/inspectionProfiles/profiles_settings.xml @@ -0,0 +1,6 @@ + + + + \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml new file mode 100644 index 0000000..e2445a2 --- /dev/null +++ b/.idea/misc.xml @@ -0,0 +1,10 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml new file mode 100644 index 0000000..cd30fdd --- /dev/null +++ b/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 0000000..35eb1dd --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file From 3b8a14f6541b350d4e88f54232f936a29dfeb4bd Mon Sep 17 00:00:00 2001 From: MujtabaMuhammad <34104990+MujtabaMuhammad@users.noreply.github.com> Date: Sun, 6 Oct 2024 13:32:21 -0400 Subject: [PATCH 3/4] This inlcudes function for get educational experince out of the resume --- pdfReader.py | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/pdfReader.py b/pdfReader.py index 0866b4f..5311eaf 100644 --- a/pdfReader.py +++ b/pdfReader.py @@ -1,8 +1,12 @@ from pypdf import PdfReader import re +def cleaner(list_array, item): + res = [i for i in list_array if i != item] + return res -def pefReader(data): +def pdfReader(data): + item ='' reader = PdfReader(data) content = reader.pages[0].extract_text() content_lower = content.lower() @@ -12,12 +16,14 @@ def pefReader(data): pattern_2 = r' {2,}' results = re.sub(pattern_2, ' ', elements) result_arr = results.split(' ') - for i, x in enumerate(result_arr): - if x == 'education': - for j in range(i - 2, i + 120): - education_array.append(content_array[j]) - education_exp = ' '.join(education_array) - #print(content_array) - print(education_exp) + result_arr.remove('') + new_arr = cleaner(result_arr, '') + print(new_arr[0]) + y = new_arr.index('\nwork') + for i, x in enumerate(new_arr): + if x.strip() == "education:": #stripe function takes care of the \n that is present with education just like it is with every new line. also regex may be better # since characters like : will cause a probelm with indiviusal word search. + for i in range(i, y): + education_array.append(new_arr[i]) -pefReader('MMujtaba-CV.pdf') + print(education_array) +pdfReader('MMujtaba-CV.pdf') From 58d2fe43adbdc87f24d5843cf6e3c1f014c4921c Mon Sep 17 00:00:00 2001 From: MujtabaMuhammad <34104990+MujtabaMuhammad@users.noreply.github.com> Date: Mon, 14 Oct 2024 15:50:11 -0400 Subject: [PATCH 4/4] added function for extracting work experience, bio, summary --- pdfReader.py | 73 +++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 61 insertions(+), 12 deletions(-) diff --git a/pdfReader.py b/pdfReader.py index 5311eaf..f2b40c8 100644 --- a/pdfReader.py +++ b/pdfReader.py @@ -1,16 +1,21 @@ from pypdf import PdfReader import re + def cleaner(list_array, item): res = [i for i in list_array if i != item] return res -def pdfReader(data): - item ='' + +def pefReader(data): + reader = PdfReader(data) - content = reader.pages[0].extract_text() + content = reader.pages[0].extract_text() + reader.pages[1].extract_text() content_lower = content.lower() education_array = [] + work_array = [] + summary_array = [] + personal_data = [] content_array = content_lower.split(' ') elements = ' '.join(content_array) pattern_2 = r' {2,}' @@ -18,12 +23,56 @@ def pdfReader(data): result_arr = results.split(' ') result_arr.remove('') new_arr = cleaner(result_arr, '') - print(new_arr[0]) - y = new_arr.index('\nwork') - for i, x in enumerate(new_arr): - if x.strip() == "education:": #stripe function takes care of the \n that is present with education just like it is with every new line. also regex may be better # since characters like : will cause a probelm with indiviusal word search. - for i in range(i, y): - education_array.append(new_arr[i]) - - print(education_array) -pdfReader('MMujtaba-CV.pdf') + try: + end_of_summary = new_arr.index('\neducation:') + end_of_edu = new_arr.index('\nwork') + end_of_work = new_arr.index('\nleadership') + end_of_publications = new_arr.index('') + end_of_bio = new_arr.index('\nobjective') + + except ValueError: + print('Value does not exist') + try: + for i, x in enumerate(new_arr): + if x.strip() == "education:": # stripe function takes care of the \n that is present with education just like it is with every new line. also regex may be better # since characters like : will cause a probelm with indiviusal word search. + for i in range(i, end_of_edu): + education_array.append(new_arr[i]) + except IndexError: + print('Educational experience not found') + + try: + for a, b in enumerate(new_arr): + if b.strip() == "work" and new_arr[a + 1].strip() == "experience:": + for a in range(a, end_of_work): + work_array.append(new_arr[a]) + except IndexError: + print('Work experience not found') + + try: + for c, d in enumerate(new_arr): + if d.strip() == "objective" or d.strip() == "summary": + for c in range(c, end_of_summary): + summary_array.append(new_arr[c]) + except IndexError: + print('Professional summary not found') + + try: + for e, f in enumerate(new_arr): + if f.strip() == "publications" or d.strip() == "projects": + for c in range(c, end_of_publications): + summary_array.append(new_arr[c]) + except IndexError: + print('Publications not found') + + try: + for g, h in enumerate(new_arr[0:15]): + personal_data.append(h) + except IndexError: + print('Publications not found') + + #print(new_arr) + print(f'\n Personal Data: {personal_data} \n\n Candidate Objective: {summary_array} \n\n ' + f'Educational experince : {education_array} \n\n Work Experience : {work_array}') + + +pefReader('MMujtaba-CV.pdf')