Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file added Job Index Research - Germany.pdf
Binary file not shown.
3,389 changes: 3,389 additions & 0 deletions aggregate_job_postings_DE.csv

Large diffs are not rendered by default.

284 changes: 284 additions & 0 deletions data_cleaning.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,284 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"New CSV file created: job_postings_by_sector_DE_aggregated.csv\n"
]
}
],
"source": [
"import pandas as pd\n",
"\n",
"# Input (daily) and output (monthly) CSV locations\n",
"input_file = 'job_postings_by_sector_DE.csv' # Replace with your file path\n",
"output_file = 'job_postings_by_sector_DE_aggregated.csv' # Output path\n",
"\n",
"# Read the daily data, parse the dates and derive a year-month period per row\n",
"df = pd.read_csv(input_file).assign(\n",
"    date=lambda frame: pd.to_datetime(frame['date']),\n",
"    Month=lambda frame: frame['date'].dt.to_period('M'),\n",
")\n",
"\n",
"# Average the index per sector and month, then rebuild a first-of-month\n",
"# 'date' column and tag every row with the country code 'DE'\n",
"monthly_aggregated = (\n",
"    df.groupby(['display_name', 'Month'])['indeed_job_postings_index']\n",
"    .mean()\n",
"    .reset_index()\n",
"    .assign(\n",
"        date=lambda frame: frame['Month'].dt.to_timestamp(),\n",
"        jobcountry='DE',\n",
"    )\n",
")\n",
"\n",
"# Keep only the output columns, in the required order\n",
"output_columns = ['date', 'jobcountry', 'indeed_job_postings_index', 'display_name']\n",
"output_df = monthly_aggregated[output_columns]\n",
"\n",
"# Write the monthly data to disk and report the file name\n",
"output_df.to_csv(output_file, index=False)\n",
"print(f\"New CSV file created: {output_file}\")\n"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"Data file: https://www.kaggle.com/datasets/kimminh21/job-postings/data  \n",
"\n",
"indeed_job_postings_index: This column appears to be a normalized (indexed) measure of the number of job postings on Indeed for a specific sector. The index is tracked over time; after the monthly aggregation in this notebook, each row holds the value for one month and one sector. An index value around 100 suggests a baseline or reference point, while values above or below 100 indicate a relative increase or decrease in job postings compared to that baseline.\n",
"\n",
"For example: A value of 100.88 in February 2020 for \"Accounting\" suggests that job postings in that sector were slightly above the baseline.\n",
"\n",
"Step 1: Aggregate the daily data to monthly data (output: job_postings_by_sector_DE_aggregated.csv)  \n",
"Step 2: Create job_postings.db (not important, as the CSV file will be added as well)  \n",
"Step 3: Compute the month-over-month percentage change for the IT-related sectors (output: it_sectors_mom_percentage_change.csv)\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 2296 entries, 0 to 2295\n",
"Data columns (total 4 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 date 2296 non-null object \n",
" 1 jobcountry 2296 non-null object \n",
" 2 indeed_job_postings_index 2296 non-null float64\n",
" 3 display_name 2296 non-null object \n",
"dtypes: float64(1), object(3)\n",
"memory usage: 71.9+ KB\n",
" indeed_job_postings_index\n",
"count 2296.000000\n",
"mean 142.647285\n",
"std 46.061219\n",
"min 37.538710\n",
"25% 104.185968\n",
"50% 142.528226\n",
"75% 172.611452\n",
"max 335.344667\n",
"date 0\n",
"jobcountry 0\n",
"indeed_job_postings_index 0\n",
"display_name 0\n",
"dtype: int64\n",
" date jobcountry indeed_job_postings_index display_name\n",
"0 2020-02-01 DE 100.881379 Accounting\n",
"1 2020-03-01 DE 97.552258 Accounting\n",
"2 2020-04-01 DE 82.365000 Accounting\n",
"3 2020-05-01 DE 78.710645 Accounting\n",
"4 2020-06-01 DE 79.868667 Accounting\n"
]
}
],
"source": [
"import pandas as pd\n",
"\n",
"# Load the monthly aggregated dataset\n",
"file_path = 'job_postings_by_sector_DE_aggregated.csv'\n",
"df = pd.read_csv(file_path)\n",
"\n",
"# Display basic information about the dataset.\n",
"# DataFrame.info() prints its report itself and returns None, so it is\n",
"# called directly: wrapping it in print() adds a stray \"None\" line.\n",
"df.info()\n",
"print(df.describe())\n",
"\n",
"# Check for missing values (count of nulls per column)\n",
"print(df.isnull().sum())\n",
"\n",
"# Show the first few rows of the dataset\n",
"print(df.head())\n"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" date jobcountry indeed_job_postings_index display_name\n",
"0 2020-02-01 DE 100.881379 Accounting\n",
"1 2020-03-01 DE 97.552258 Accounting\n",
"2 2020-04-01 DE 82.365000 Accounting\n",
"3 2020-05-01 DE 78.710645 Accounting\n",
"4 2020-06-01 DE 79.868667 Accounting\n"
]
}
],
"source": [
"# Creating a DB file to be imported.\n",
"# Relies on `df` and `pd` defined by the previous cell.\n",
"import sqlite3\n",
"from contextlib import closing\n",
"\n",
"# closing() guarantees the connection is closed when the block exits, even\n",
"# if the write or the read fails. A sqlite3 connection used directly as a\n",
"# context manager only commits or rolls back; it does not close itself.\n",
"with closing(sqlite3.connect('job_postings.db')) as conn:\n",
"    # Load the dataset into SQL (replaces the table if it already exists)\n",
"    df.to_sql('job_postings', conn, if_exists='replace', index=False)\n",
"\n",
"    # Confirm the table has been created and data inserted\n",
"    query = \"SELECT * FROM job_postings LIMIT 5;\"\n",
"    print(pd.read_sql(query, conn))"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" 2020-02 MoM 2020-03 MoM 2020-04 MoM \\\n",
"display_name \n",
"IT Operations & Helpdesk NaN -2.329190 -14.790635 \n",
"Information Design & Documentation NaN -3.132472 -17.249557 \n",
"Software Development NaN -3.927253 -14.060045 \n",
"\n",
" 2020-05 MoM 2020-06 MoM 2020-07 MoM \\\n",
"display_name \n",
"IT Operations & Helpdesk -1.554613 1.366314 2.055617 \n",
"Information Design & Documentation -7.670945 0.828042 2.456450 \n",
"Software Development -3.786816 2.412760 2.380499 \n",
"\n",
" 2020-08 MoM 2020-09 MoM 2020-10 MoM \\\n",
"display_name \n",
"IT Operations & Helpdesk 3.997910 -2.036803 2.585348 \n",
"Information Design & Documentation 1.493875 -0.836582 3.734547 \n",
"Software Development 3.128727 -0.137640 3.294270 \n",
"\n",
" 2020-11 MoM ... 2023-12 MoM \\\n",
"display_name ... \n",
"IT Operations & Helpdesk 4.687251 ... -0.191140 \n",
"Information Design & Documentation 7.375325 ... -4.686921 \n",
"Software Development 3.571830 ... -2.504072 \n",
"\n",
" 2024-01 MoM 2024-02 MoM 2024-03 MoM \\\n",
"display_name \n",
"IT Operations & Helpdesk 1.179951 -6.543267 -5.490616 \n",
"Information Design & Documentation -0.930707 -5.259326 -7.330200 \n",
"Software Development -1.178428 -6.573449 -7.139246 \n",
"\n",
" 2024-04 MoM 2024-05 MoM 2024-06 MoM \\\n",
"display_name \n",
"IT Operations & Helpdesk -4.599388 0.681337 -1.989327 \n",
"Information Design & Documentation -4.946442 -4.876746 -2.345677 \n",
"Software Development -4.977985 -2.926107 -3.888864 \n",
"\n",
" 2024-07 MoM 2024-08 MoM 2024-09 MoM \n",
"display_name \n",
"IT Operations & Helpdesk -2.634112 -0.764173 -5.143268 \n",
"Information Design & Documentation -1.996063 -0.650301 -4.062791 \n",
"Software Development -3.032597 -1.410654 -4.304688 \n",
"\n",
"[3 rows x 56 columns]\n"
]
}
],
"source": [
"import pandas as pd\n",
"\n",
"# Step 1: Load the data from the CSV file\n",
"data = pd.read_csv('job_postings_by_sector_DE_aggregated.csv')\n",
"\n",
"# Step 2: Filter the data for IT-related sectors\n",
"it_sectors = ['IT Operations & Helpdesk', 'Information Design & Documentation', 'Software Development']\n",
"# .copy() makes filtered_data an independent DataFrame rather than a slice\n",
"# of `data`, so the column assignments below do not raise\n",
"# SettingWithCopyWarning.\n",
"filtered_data = data[data['display_name'].isin(it_sectors)].copy()\n",
"\n",
"# Step 3: Convert the 'date' column to datetime format\n",
"filtered_data['date'] = pd.to_datetime(filtered_data['date'])\n",
"\n",
"# Step 4: Sort the data by 'display_name' and 'date' so that pct_change()\n",
"# compares each month with the preceding month of the same sector\n",
"filtered_data = filtered_data.sort_values(by=['display_name', 'date'])\n",
"\n",
"# Step 5: Calculate the month-over-month percentage change per sector\n",
"# (the first month of each sector has no predecessor and is NaN)\n",
"filtered_data['mom_percentage_change'] = filtered_data.groupby('display_name')['indeed_job_postings_index'].pct_change() * 100\n",
"\n",
"# Step 6: Pivot the table to display each month as a column\n",
"pivot_table = filtered_data.pivot(index='display_name', columns='date', values='mom_percentage_change')\n",
"\n",
"# Step 7: Rename columns to 'YYYY-MM MoM' format\n",
"pivot_table.columns = [f\"{col.strftime('%Y-%m')} MoM\" for col in pivot_table.columns]\n",
"\n",
"# Step 8: Save the result to a CSV file\n",
"pivot_table.to_csv('it_sectors_mom_percentage_change.csv')\n",
"\n",
"# Also display the result\n",
"print(pivot_table)\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.2"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}
4 changes: 4 additions & 0 deletions it_sectors_mom_percentage_change.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
display_name,2020-02 MoM,2020-03 MoM,2020-04 MoM,2020-05 MoM,2020-06 MoM,2020-07 MoM,2020-08 MoM,2020-09 MoM,2020-10 MoM,2020-11 MoM,2020-12 MoM,2021-01 MoM,2021-02 MoM,2021-03 MoM,2021-04 MoM,2021-05 MoM,2021-06 MoM,2021-07 MoM,2021-08 MoM,2021-09 MoM,2021-10 MoM,2021-11 MoM,2021-12 MoM,2022-01 MoM,2022-02 MoM,2022-03 MoM,2022-04 MoM,2022-05 MoM,2022-06 MoM,2022-07 MoM,2022-08 MoM,2022-09 MoM,2022-10 MoM,2022-11 MoM,2022-12 MoM,2023-01 MoM,2023-02 MoM,2023-03 MoM,2023-04 MoM,2023-05 MoM,2023-06 MoM,2023-07 MoM,2023-08 MoM,2023-09 MoM,2023-10 MoM,2023-11 MoM,2023-12 MoM,2024-01 MoM,2024-02 MoM,2024-03 MoM,2024-04 MoM,2024-05 MoM,2024-06 MoM,2024-07 MoM,2024-08 MoM,2024-09 MoM
IT Operations & Helpdesk,,-2.3291898287425394,-14.790635013108055,-1.554613475440414,1.3663142858270305,2.055617084141903,3.997909689428658,-2.0368027053692783,2.5853476995377545,4.68725098389875,4.059843337727997,4.105761467152247,-0.22004067735521549,2.7439552916949106,2.1247146029135244,6.918952990988858,5.4597707758356595,5.580694775381745,7.7933814093874565,1.821773292741291,3.6860982468136294,6.237841132705024,4.566084608555676,2.4479359198753325,-1.5520121685010269,1.821499065229415,1.0389002578059525,4.021019166618323,0.539642172798005,0.27479059466108957,1.8491060401878645,-3.506680180595234,-1.162100068561045,4.537507213668879,-1.3303100684590508,-2.379070176950371,-0.7777586493319166,-1.9743844596568638,-3.9439950545186675,-2.488442487390885,-0.9251177100842445,-0.3971338669034985,0.10928635158615752,-6.245698349274864,-0.4894780303438706,1.4721756890277549,-0.19113999315319985,1.1799509876927905,-6.543266928434887,-5.490616204581855,-4.599388271099936,0.6813371737198537,-1.9893268838970135,-2.6341122185931876,-0.7641725917131192,-5.143268384628829
Information Design & Documentation,,-3.1324719815493007,-17.24955653793756,-7.670944975300076,0.8280421663071191,2.4564497505784155,1.4938748594472528,-0.8365819797306662,3.734546723729726,7.375325427256119,3.2681897504815316,7.18161436614988,3.88549537702616,5.925688477295088,2.5114883279825806,4.198058314978215,4.911686790987235,6.502114549225868,9.473572258653572,2.5458918310526713,4.299912216508872,7.027783070213123,2.657035163743937,3.456696025293593,2.8012673069282457,1.4648327998608934,0.8531927695084596,0.05778664370765796,-1.1112613012971395,-2.5444154052423285,-0.2999235300672698,-3.7057714482098603,-1.358795010122249,3.731457848941422,-4.604704487155875,-3.569971398402849,-3.054291564368661,-2.2861678273443653,-5.2170504009420915,-6.530879076224649,-2.592918194155569,-1.6173855114149038,-0.6056712614141757,-3.9789662390582237,-2.722769671757397,2.239025983024745,-4.68692105608377,-0.9307068989196532,-5.259325544467308,-7.330200301136092,-4.946442109884098,-4.8767462078044606,-2.345676718735301,-1.9960629436909016,-0.6503010796939157,-4.062790643146874
Software Development,,-3.927253458572866,-14.060044855696274,-3.7868162692846985,2.4127597716364635,2.3804992546071713,3.12872744171635,-0.13763977083401935,3.2942699487363614,3.5718298852416064,3.8967505735408636,4.610424539312774,0.9550558942737375,4.1918483181558885,2.795473132098225,4.435315634341808,4.780142047081726,6.532543000776503,7.475139312213308,2.774865443804697,5.064399374293682,3.674628299906857,2.73805809053127,0.814656764868249,1.2530731066295653,1.146081180538494,1.9261856228280694,2.2705398363481955,-0.3909630886720139,-2.014219592277211,-0.7120995674844921,-2.586733797010321,-0.8564815886692778,0.23384860111184125,-2.2129559754374917,-4.412172786976221,-0.7458515453967474,-3.6843500549597064,-5.577528448326774,-5.506813417999501,-2.455854912749811,-1.5512380483792998,-1.54388112920808,-4.874707289757685,0.10606446184970864,-2.189291383982994,-2.5040720168968633,-1.1784279218946314,-6.573449422276378,-7.139246259354626,-4.977984678597302,-2.926106925270866,-3.888863572497292,-3.0325970699343063,-1.410654475221862,-4.304687629155712
Binary file added job_postings.db
Binary file not shown.
Loading