diff --git a/.idea/.gitignore b/.idea/.gitignore new file mode 100644 index 0000000..26d3352 --- /dev/null +++ b/.idea/.gitignore @@ -0,0 +1,3 @@ +# Default ignored files +/shelf/ +/workspace.xml diff --git a/.idea/.name b/.idea/.name new file mode 100644 index 0000000..aa15e3e --- /dev/null +++ b/.idea/.name @@ -0,0 +1 @@ +app.py \ No newline at end of file diff --git a/.idea/Env1.iml b/.idea/Env1.iml new file mode 100644 index 0000000..e110c18 --- /dev/null +++ b/.idea/Env1.iml @@ -0,0 +1,17 @@ + + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/inspectionProfiles/Project_Default.xml b/.idea/inspectionProfiles/Project_Default.xml new file mode 100644 index 0000000..054a04d --- /dev/null +++ b/.idea/inspectionProfiles/Project_Default.xml @@ -0,0 +1,28 @@ + + + + \ No newline at end of file diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml new file mode 100644 index 0000000..105ce2d --- /dev/null +++ b/.idea/inspectionProfiles/profiles_settings.xml @@ -0,0 +1,6 @@ + + + + \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml new file mode 100644 index 0000000..e2445a2 --- /dev/null +++ b/.idea/misc.xml @@ -0,0 +1,10 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml new file mode 100644 index 0000000..cd30fdd --- /dev/null +++ b/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 0000000..35eb1dd --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/app.py b/app.py index 59194f7..9b731ae 100644 --- a/app.py +++ b/app.py @@ -15,13 +15,11 @@ def editor(): return render_template('editor.html') - @app.route('/submit', methods=['Post']) def submit_data(): data = request.json words = re.findall(r'\b\w+\b', data.lower()) word_counts = Counter(words) - # print(f'Word count: 
"""Split the text of a CV/resume PDF into its main sections.

The text is lower-cased and split on spaces only, so a token that begins a
new line keeps its leading ``\\n`` (for example ``'\\neducation:'``).
Headings are therefore compared on the *stripped* token.
"""
import re
from itertools import islice

# Number of leading tokens reported as the candidate's personal data.
_PERSONAL_DATA_TOKENS = 15

# Section name -> heading variants. Each variant is the run of stripped,
# lower-cased tokens that opens the section. "leadership" is never reported;
# it only marks where the preceding section stops.
_HEADINGS = {
    "summary": (("objective",), ("summary",)),
    "education": (("education:",),),
    "work": (("work", "experience:"),),
    "leadership": (("leadership",),),
    "publications": (("publications",), ("projects",)),
}

# Message printed when a reported section has no heading in the document.
_NOT_FOUND_MESSAGES = {
    "summary": "Professional summary not found",
    "education": "Educational experience not found",
    "work": "Work experience not found",
    "publications": "Publications not found",
}


def cleaner(list_array, item):
    """Return a new list holding every element of *list_array* except *item*.

    Args:
        list_array: Iterable of values to filter.
        item: Value to drop; elements are compared with ``!=``.

    Returns:
        A new list; the input is not modified.
    """
    return [element for element in list_array if element != item]


def _tokenize(content):
    """Lower-case *content* and split it on runs of spaces, dropping empties."""
    return cleaner(re.split(r" +", content.lower()), "")


def _first_index(stripped_tokens, heading):
    """Return the first index where *heading* occurs as a token run, or None."""
    width = len(heading)
    target = list(heading)
    last_start = len(stripped_tokens) - width
    return next(
        (
            index
            for index in range(last_start + 1)
            if stripped_tokens[index:index + width] == target
        ),
        None,
    )


def _split_sections(tokens):
    """Map each section whose heading is present to its slice of *tokens*.

    A section starts at its heading and runs up to the next located heading
    of any other section, or to the end of the document when none follows.
    """
    stripped_tokens = [token.strip() for token in tokens]
    starts = {}
    for name, variants in _HEADINGS.items():
        hits = [
            index
            for index in (_first_index(stripped_tokens, v) for v in variants)
            if index is not None
        ]
        if hits:
            starts[name] = min(hits)

    sections = {}
    for name, start in starts.items():
        following = [other for other in starts.values() if other > start]
        sections[name] = tokens[start:min(following, default=len(tokens))]
    return sections


def pefReader(data, max_pages=2):
    """Extract, print and return the main sections of a CV/resume PDF.

    Args:
        data: Anything ``pypdf.PdfReader`` accepts (a path or binary stream),
            or an already-open reader, i.e. any object exposing ``.pages``
            whose items provide ``extract_text()``.
        max_pages: How many leading pages to read. The default of 2 matches
            the previous behaviour; ``None`` reads every page. Documents
            shorter than this are read in full instead of failing.

    Returns:
        A dict with the keys ``personal_data``, ``summary``, ``education``,
        ``work`` and ``publications``. Each value is a list of lower-cased
        tokens, empty when the section heading was not found.
    """
    if hasattr(data, "pages"):
        reader = data
    else:
        # Imported lazily so the text helpers stay importable without pypdf.
        from pypdf import PdfReader

        reader = PdfReader(data)

    # Page texts are concatenated without a separator, as before.
    content = "".join(
        page.extract_text() or "" for page in islice(reader.pages, max_pages)
    )
    tokens = _tokenize(content)
    sections = _split_sections(tokens)

    for name, message in _NOT_FOUND_MESSAGES.items():
        if name not in sections:
            print(message)

    personal_data = tokens[:_PERSONAL_DATA_TOKENS]
    summary_array = sections.get("summary", [])
    education_array = sections.get("education", [])
    work_array = sections.get("work", [])

    print(f'\n Personal Data: {personal_data} \n\n Candidate Objective: {summary_array} \n\n '
          f'Educational experince : {education_array} \n\n Work Experience : {work_array}')

    return {
        "personal_data": personal_data,
        "summary": summary_array,
        "education": education_array,
        "work": work_array,
        "publications": sections.get("publications", []),
    }


if __name__ == '__main__':
    pefReader('MMujtaba-CV.pdf')