+
+
+
\ No newline at end of file
diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml
new file mode 100644
index 0000000..105ce2d
--- /dev/null
+++ b/.idea/inspectionProfiles/profiles_settings.xml
@@ -0,0 +1,6 @@
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/misc.xml b/.idea/misc.xml
new file mode 100644
index 0000000..e2445a2
--- /dev/null
+++ b/.idea/misc.xml
@@ -0,0 +1,10 @@
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/modules.xml b/.idea/modules.xml
new file mode 100644
index 0000000..cd30fdd
--- /dev/null
+++ b/.idea/modules.xml
@@ -0,0 +1,8 @@
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
new file mode 100644
index 0000000..35eb1dd
--- /dev/null
+++ b/.idea/vcs.xml
@@ -0,0 +1,6 @@
+
+
+
+
+
+
\ No newline at end of file
From 3b8a14f6541b350d4e88f54232f936a29dfeb4bd Mon Sep 17 00:00:00 2001
From: MujtabaMuhammad <34104990+MujtabaMuhammad@users.noreply.github.com>
Date: Sun, 6 Oct 2024 13:32:21 -0400
Subject: [PATCH 3/4] This includes a function to get educational experience out
of the resume
---
pdfReader.py | 24 +++++++++++++++---------
1 file changed, 15 insertions(+), 9 deletions(-)
diff --git a/pdfReader.py b/pdfReader.py
index 0866b4f..5311eaf 100644
--- a/pdfReader.py
+++ b/pdfReader.py
@@ -1,8 +1,12 @@
from pypdf import PdfReader
import re
+def cleaner(list_array, item):
+ res = [i for i in list_array if i != item]
+ return res
-def pefReader(data):
+def pdfReader(data):
+ item =''
reader = PdfReader(data)
content = reader.pages[0].extract_text()
content_lower = content.lower()
@@ -12,12 +16,14 @@ def pefReader(data):
pattern_2 = r' {2,}'
results = re.sub(pattern_2, ' ', elements)
result_arr = results.split(' ')
- for i, x in enumerate(result_arr):
- if x == 'education':
- for j in range(i - 2, i + 120):
- education_array.append(content_array[j])
- education_exp = ' '.join(education_array)
- #print(content_array)
- print(education_exp)
+ result_arr.remove('')
+ new_arr = cleaner(result_arr, '')
+ print(new_arr[0])
+ y = new_arr.index('\nwork')
+ for i, x in enumerate(new_arr):
+ if x.strip() == "education:": #stripe function takes care of the \n that is present with education just like it is with every new line. also regex may be better # since characters like : will cause a probelm with indiviusal word search.
+ for i in range(i, y):
+ education_array.append(new_arr[i])
-pefReader('MMujtaba-CV.pdf')
+ print(education_array)
+pdfReader('MMujtaba-CV.pdf')
From 58d2fe43adbdc87f24d5843cf6e3c1f014c4921c Mon Sep 17 00:00:00 2001
From: MujtabaMuhammad <34104990+MujtabaMuhammad@users.noreply.github.com>
Date: Mon, 14 Oct 2024 15:50:11 -0400
Subject: [PATCH 4/4] added function for extracting work experience, bio,
summary
---
pdfReader.py | 73 +++++++++++++++++++++++++++++++++++++++++++---------
1 file changed, 61 insertions(+), 12 deletions(-)
diff --git a/pdfReader.py b/pdfReader.py
index 5311eaf..f2b40c8 100644
--- a/pdfReader.py
+++ b/pdfReader.py
@@ -1,16 +1,21 @@
from pypdf import PdfReader
import re
+
def cleaner(list_array, item):
res = [i for i in list_array if i != item]
return res
-def pdfReader(data):
- item =''
+
+def pefReader(data):
+
reader = PdfReader(data)
- content = reader.pages[0].extract_text()
+ content = reader.pages[0].extract_text() + reader.pages[1].extract_text()
content_lower = content.lower()
education_array = []
+ work_array = []
+ summary_array = []
+ personal_data = []
content_array = content_lower.split(' ')
elements = ' '.join(content_array)
pattern_2 = r' {2,}'
@@ -18,12 +23,56 @@ def pdfReader(data):
result_arr = results.split(' ')
result_arr.remove('')
new_arr = cleaner(result_arr, '')
- print(new_arr[0])
- y = new_arr.index('\nwork')
- for i, x in enumerate(new_arr):
- if x.strip() == "education:": #stripe function takes care of the \n that is present with education just like it is with every new line. also regex may be better # since characters like : will cause a probelm with indiviusal word search.
- for i in range(i, y):
- education_array.append(new_arr[i])
-
- print(education_array)
-pdfReader('MMujtaba-CV.pdf')
+ try:
+ end_of_summary = new_arr.index('\neducation:')
+ end_of_edu = new_arr.index('\nwork')
+ end_of_work = new_arr.index('\nleadership')
+ end_of_publications = new_arr.index('')
+ end_of_bio = new_arr.index('\nobjective')
+
+ except ValueError:
+ print('Value does not exist')
+ try:
+ for i, x in enumerate(new_arr):
+ if x.strip() == "education:": # stripe function takes care of the \n that is present with education just like it is with every new line. also regex may be better # since characters like : will cause a probelm with indiviusal word search.
+ for i in range(i, end_of_edu):
+ education_array.append(new_arr[i])
+ except IndexError:
+ print('Educational experience not found')
+
+ try:
+ for a, b in enumerate(new_arr):
+ if b.strip() == "work" and new_arr[a + 1].strip() == "experience:":
+ for a in range(a, end_of_work):
+ work_array.append(new_arr[a])
+ except IndexError:
+ print('Work experience not found')
+
+ try:
+ for c, d in enumerate(new_arr):
+ if d.strip() == "objective" or d.strip() == "summary":
+ for c in range(c, end_of_summary):
+ summary_array.append(new_arr[c])
+ except IndexError:
+ print('Professional summary not found')
+
+ try:
+ for e, f in enumerate(new_arr):
+ if f.strip() == "publications" or d.strip() == "projects":
+ for c in range(c, end_of_publications):
+ summary_array.append(new_arr[c])
+ except IndexError:
+ print('Publications not found')
+
+ try:
+ for g, h in enumerate(new_arr[0:15]):
+ personal_data.append(h)
+ except IndexError:
+ print('Publications not found')
+
+ #print(new_arr)
+ print(f'\n Personal Data: {personal_data} \n\n Candidate Objective: {summary_array} \n\n '
+ f'Educational experince : {education_array} \n\n Work Experience : {work_array}')
+
+
+pefReader('MMujtaba-CV.pdf')