ironhack-labs · senlerk · Sep 27, 2024
diff --git a/Job Index Research - Germany.pdf b/Job Index Research - Germany.pdf
diff --git a/aggregate_job_postings_DE.csv b/aggregate_job_postings_DE.csv
diff --git a/data_cleaning.ipynb b/data_cleaning.ipynb
@@ -0,0 +1,284 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "New CSV file created: job_postings_by_sector_DE_aggregated.csv\n"
+     ]
+    }
+   ],
+   "source": [
+    "import pandas as pd\n",
+    "\n",
+    "# Source (daily index) and destination (monthly index) CSV paths\n",
+    "input_file = 'job_postings_by_sector_DE.csv'  # Replace with your file path\n",
+    "output_file = 'job_postings_by_sector_DE_aggregated.csv'  # Output path\n",
+    "\n",
+    "# Read the input, parse 'date' as datetime and tag each row with its year-month period\n",
+    "df = pd.read_csv(input_file)\n",
+    "df['date'] = pd.to_datetime(df['date'])\n",
+    "df['Month'] = df['date'].dt.to_period('M')\n",
+    "\n",
+    "# Mean indeed_job_postings_index per (display_name, Month); 'date' is the first day\n",
+    "# of each month and 'jobcountry' is the constant 'DE'\n",
+    "monthly_aggregated = (\n",
+    "    df.groupby(['display_name', 'Month'])['indeed_job_postings_index']\n",
+    "    .mean()\n",
+    "    .reset_index()\n",
+    "    .assign(\n",
+    "        date=lambda frame: frame['Month'].dt.to_timestamp(),\n",
+    "        jobcountry='DE',\n",
+    "    )\n",
+    ")\n",
+    "\n",
+    "# Select the output columns in their final order\n",
+    "column_order = ['date', 'jobcountry', 'indeed_job_postings_index', 'display_name']\n",
+    "output_df = monthly_aggregated[column_order]\n",
+    "\n",
+    "# Save the new DataFrame to a CSV file (without the index column)\n",
+    "output_df.to_csv(output_file, index=False)\n",
+    "\n",
+    "print(f\"New CSV file created: {output_file}\")\n"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Data file: https://www.kaggle.com/datasets/kimminh21/job-postings/data \n",
+    "\n",
+    "indeed_job_postings_index: In this dataset, the indeed_job_postings_index column appears to be a normalized (indexed) measure of the number of job postings on Indeed for a specific sector, tracked over time. After the aggregation in this notebook, each row holds the monthly average for one sector. An index value around 100 corresponds to the baseline (reference point), while values above or below 100 indicate relatively more or fewer job postings than at that baseline.\n",
+    "\n",
+    "For example: A value of 100.88 in February 2020 for \"Accounting\" suggests a level slightly above the baseline for job postings in that sector.\n",
+    "\n",
+    "Step 1: Aggregating daily data to monthly data \n",
+    "Step 2: job_postings.db created (not essential, as the CSV file will be added) \n",
+    "Step 3: \n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "<class 'pandas.core.frame.DataFrame'>\n",
+      "RangeIndex: 2296 entries, 0 to 2295\n",
+      "Data columns (total 4 columns):\n",
+      " #   Column                     Non-Null Count  Dtype  \n",
+      "---  ------                     --------------  -----  \n",
+      " 0   date                       2296 non-null   object \n",
+      " 1   jobcountry                 2296 non-null   object \n",
+      " 2   indeed_job_postings_index  2296 non-null   float64\n",
+      " 3   display_name               2296 non-null   object \n",
+      "dtypes: float64(1), object(3)\n",
+      "memory usage: 71.9+ KB\n",
+      "None\n",
+      "       indeed_job_postings_index\n",
+      "count                2296.000000\n",
+      "mean                  142.647285\n",
+      "std                    46.061219\n",
+      "min                    37.538710\n",
+      "25%                   104.185968\n",
+      "50%                   142.528226\n",
+      "75%                   172.611452\n",
+      "max                   335.344667\n",
+      "date                         0\n",
+      "jobcountry                   0\n",
+      "indeed_job_postings_index    0\n",
+      "display_name                 0\n",
+      "dtype: int64\n",
+      "         date jobcountry  indeed_job_postings_index display_name\n",
+      "0  2020-02-01         DE                 100.881379   Accounting\n",
+      "1  2020-03-01         DE                  97.552258   Accounting\n",
+      "2  2020-04-01         DE                  82.365000   Accounting\n",
+      "3  2020-05-01         DE                  78.710645   Accounting\n",
+      "4  2020-06-01         DE                  79.868667   Accounting\n"
+     ]
+    }
+   ],
+   "source": [
+    "import pandas as pd\n",
+    "\n",
+    "# Load the aggregated dataset\n",
+    "file_path = 'job_postings_by_sector_DE_aggregated.csv'\n",
+    "df = pd.read_csv(file_path)\n",
+    "\n",
+    "# Display basic information about the dataset\n",
+    "df.info()  # info() prints its report itself and returns None; print(df.info()) adds a stray 'None' line\n",
+    "print(df.describe())\n",
+    "\n",
+    "# Check for missing values\n",
+    "print(df.isnull().sum())\n",
+    "\n",
+    "# Show the first few rows of the dataset\n",
+    "print(df.head())\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "         date jobcountry  indeed_job_postings_index display_name\n",
+      "0  2020-02-01         DE                 100.881379   Accounting\n",
+      "1  2020-03-01         DE                  97.552258   Accounting\n",
+      "2  2020-04-01         DE                  82.365000   Accounting\n",
+      "3  2020-05-01         DE                  78.710645   Accounting\n",
+      "4  2020-06-01         DE                  79.868667   Accounting\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Creating a DB file to be imported (uses the DataFrame `df` loaded in an earlier cell)\n",
+    "import sqlite3\n",
+    "\n",
+    "# Open (or create) the SQLite database file; the connection is closed in `finally` even if a step fails\n",
+    "conn = sqlite3.connect('job_postings.db')\n",
+    "try:\n",
+    "    # Load the dataset into SQL; if_exists='replace' drops and recreates the table on every run\n",
+    "    df.to_sql('job_postings', conn, if_exists='replace', index=False)\n",
+    "    # Confirm the table has been created and data inserted\n",
+    "    query = \"SELECT * FROM job_postings LIMIT 5;\"\n",
+    "    print(pd.read_sql(query, conn))\n",
+    "finally:\n",
+    "    conn.close()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "                                    2020-02 MoM  2020-03 MoM  2020-04 MoM  \\\n",
+      "display_name                                                                \n",
+      "IT Operations & Helpdesk                    NaN    -2.329190   -14.790635   \n",
+      "Information Design & Documentation          NaN    -3.132472   -17.249557   \n",
+      "Software Development                        NaN    -3.927253   -14.060045   \n",
+      "\n",
+      "                                    2020-05 MoM  2020-06 MoM  2020-07 MoM  \\\n",
+      "display_name                                                                \n",
+      "IT Operations & Helpdesk              -1.554613     1.366314     2.055617   \n",
+      "Information Design & Documentation    -7.670945     0.828042     2.456450   \n",
+      "Software Development                  -3.786816     2.412760     2.380499   \n",
+      "\n",
+      "                                    2020-08 MoM  2020-09 MoM  2020-10 MoM  \\\n",
+      "display_name                                                                \n",
+      "IT Operations & Helpdesk               3.997910    -2.036803     2.585348   \n",
+      "Information Design & Documentation     1.493875    -0.836582     3.734547   \n",
+      "Software Development                   3.128727    -0.137640     3.294270   \n",
+      "\n",
+      "                                    2020-11 MoM  ...  2023-12 MoM  \\\n",
+      "display_name                                     ...                \n",
+      "IT Operations & Helpdesk               4.687251  ...    -0.191140   \n",
+      "Information Design & Documentation     7.375325  ...    -4.686921   \n",
+      "Software Development                   3.571830  ...    -2.504072   \n",
+      "\n",
+      "                                    2024-01 MoM  2024-02 MoM  2024-03 MoM  \\\n",
+      "display_name                                                                \n",
+      "IT Operations & Helpdesk               1.179951    -6.543267    -5.490616   \n",
+      "Information Design & Documentation    -0.930707    -5.259326    -7.330200   \n",
+      "Software Development                  -1.178428    -6.573449    -7.139246   \n",
+      "\n",
+      "                                    2024-04 MoM  2024-05 MoM  2024-06 MoM  \\\n",
+      "display_name                                                                \n",
+      "IT Operations & Helpdesk              -4.599388     0.681337    -1.989327   \n",
+      "Information Design & Documentation    -4.946442    -4.876746    -2.345677   \n",
+      "Software Development                  -4.977985    -2.926107    -3.888864   \n",
+      "\n",
+      "                                    2024-07 MoM  2024-08 MoM  2024-09 MoM  \n",
+      "display_name                                                               \n",
+      "IT Operations & Helpdesk              -2.634112    -0.764173    -5.143268  \n",
+      "Information Design & Documentation    -1.996063    -0.650301    -4.062791  \n",
+      "Software Development                  -3.032597    -1.410654    -4.304688  \n",
+      "\n",
+      "[3 rows x 56 columns]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/var/folders/0j/h4x8twq57394t3xmh53ddyv40000gn/T/ipykernel_8806/2134418910.py:11: SettingWithCopyWarning: \n",
+      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
+      "Try using .loc[row_indexer,col_indexer] = value instead\n",
+      "\n",
+      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+      "  filtered_data['date'] = pd.to_datetime(filtered_data['date'])\n"
+     ]
+    }
+   ],
+   "source": [
+    "import pandas as pd\n",
+    "\n",
+    "# Step 1: Load the monthly aggregated data from the CSV file\n",
+    "data = pd.read_csv('job_postings_by_sector_DE_aggregated.csv')\n",
+    "\n",
+    "# Step 2: Filter the data for IT-related sectors.\n",
+    "# .copy() makes filtered_data an independent DataFrame rather than a slice of `data`,\n",
+    "# so the column assignments below no longer trigger pandas' SettingWithCopyWarning.\n",
+    "it_sectors = ['IT Operations & Helpdesk', 'Information Design & Documentation', 'Software Development']\n",
+    "filtered_data = data[data['display_name'].isin(it_sectors)].copy()\n",
+    "\n",
+    "# Step 3: Convert the 'date' column to datetime format\n",
+    "filtered_data['date'] = pd.to_datetime(filtered_data['date'])\n",
+    "\n",
+    "# Step 4: Sort the data by 'display_name' and 'date' so pct_change compares consecutive months\n",
+    "filtered_data = filtered_data.sort_values(by=['display_name', 'date'])\n",
+    "\n",
+    "# Step 5: Month-over-month percentage change per sector (the first month of each sector has no predecessor, so it is NaN)\n",
+    "filtered_data['mom_percentage_change'] = filtered_data.groupby('display_name')['indeed_job_postings_index'].pct_change() * 100\n",
+    "\n",
+    "# Step 6: Pivot the table to display each month as a column (one row per sector)\n",
+    "pivot_table = filtered_data.pivot(index='display_name', columns='date', values='mom_percentage_change')\n",
+    "\n",
+    "# Step 7: Rename columns to 'YYYY-MM MoM' format\n",
+    "pivot_table.columns = [f\"{col.strftime('%Y-%m')} MoM\" for col in pivot_table.columns]\n",
+    "\n",
+    "# Step 8: Save the result to a CSV file and display it\n",
+    "pivot_table.to_csv('it_sectors_mom_percentage_change.csv')\n",
+    "print(pivot_table)\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.2"
+  },
+  "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/it_sectors_mom_percentage_change.csv b/it_sectors_mom_percentage_change.csv
@@ -0,0 +1,4 @@
+display_name,2020-02 MoM,2020-03 MoM,2020-04 MoM,2020-05 MoM,2020-06 MoM,2020-07 MoM,2020-08 MoM,2020-09 MoM,2020-10 MoM,2020-11 MoM,2020-12 MoM,2021-01 MoM,2021-02 MoM,2021-03 MoM,2021-04 MoM,2021-05 MoM,2021-06 MoM,2021-07 MoM,2021-08 MoM,2021-09 MoM,2021-10 MoM,2021-11 MoM,2021-12 MoM,2022-01 MoM,2022-02 MoM,2022-03 MoM,2022-04 MoM,2022-05 MoM,2022-06 MoM,2022-07 MoM,2022-08 MoM,2022-09 MoM,2022-10 MoM,2022-11 MoM,2022-12 MoM,2023-01 MoM,2023-02 MoM,2023-03 MoM,2023-04 MoM,2023-05 MoM,2023-06 MoM,2023-07 MoM,2023-08 MoM,2023-09 MoM,2023-10 MoM,2023-11 MoM,2023-12 MoM,2024-01 MoM,2024-02 MoM,2024-03 MoM,2024-04 MoM,2024-05 MoM,2024-06 MoM,2024-07 MoM,2024-08 MoM,2024-09 MoM
+IT Operations & Helpdesk,,-2.3291898287425394,-14.790635013108055,-1.554613475440414,1.3663142858270305,2.055617084141903,3.997909689428658,-2.0368027053692783,2.5853476995377545,4.68725098389875,4.059843337727997,4.105761467152247,-0.22004067735521549,2.7439552916949106,2.1247146029135244,6.918952990988858,5.4597707758356595,5.580694775381745,7.7933814093874565,1.821773292741291,3.6860982468136294,6.237841132705024,4.566084608555676,2.4479359198753325,-1.5520121685010269,1.821499065229415,1.0389002578059525,4.021019166618323,0.539642172798005,0.27479059466108957,1.8491060401878645,-3.506680180595234,-1.162100068561045,4.537507213668879,-1.3303100684590508,-2.379070176950371,-0.7777586493319166,-1.9743844596568638,-3.9439950545186675,-2.488442487390885,-0.9251177100842445,-0.3971338669034985,0.10928635158615752,-6.245698349274864,-0.4894780303438706,1.4721756890277549,-0.19113999315319985,1.1799509876927905,-6.543266928434887,-5.490616204581855,-4.599388271099936,0.6813371737198537,-1.9893268838970135,-2.6341122185931876,-0.7641725917131192,-5.143268384628829
+Information Design & Documentation,,-3.1324719815493007,-17.24955653793756,-7.670944975300076,0.8280421663071191,2.4564497505784155,1.4938748594472528,-0.8365819797306662,3.734546723729726,7.375325427256119,3.2681897504815316,7.18161436614988,3.88549537702616,5.925688477295088,2.5114883279825806,4.198058314978215,4.911686790987235,6.502114549225868,9.473572258653572,2.5458918310526713,4.299912216508872,7.027783070213123,2.657035163743937,3.456696025293593,2.8012673069282457,1.4648327998608934,0.8531927695084596,0.05778664370765796,-1.1112613012971395,-2.5444154052423285,-0.2999235300672698,-3.7057714482098603,-1.358795010122249,3.731457848941422,-4.604704487155875,-3.569971398402849,-3.054291564368661,-2.2861678273443653,-5.2170504009420915,-6.530879076224649,-2.592918194155569,-1.6173855114149038,-0.6056712614141757,-3.9789662390582237,-2.722769671757397,2.239025983024745,-4.68692105608377,-0.9307068989196532,-5.259325544467308,-7.330200301136092,-4.946442109884098,-4.8767462078044606,-2.345676718735301,-1.9960629436909016,-0.6503010796939157,-4.062790643146874
+Software Development,,-3.927253458572866,-14.060044855696274,-3.7868162692846985,2.4127597716364635,2.3804992546071713,3.12872744171635,-0.13763977083401935,3.2942699487363614,3.5718298852416064,3.8967505735408636,4.610424539312774,0.9550558942737375,4.1918483181558885,2.795473132098225,4.435315634341808,4.780142047081726,6.532543000776503,7.475139312213308,2.774865443804697,5.064399374293682,3.674628299906857,2.73805809053127,0.814656764868249,1.2530731066295653,1.146081180538494,1.9261856228280694,2.2705398363481955,-0.3909630886720139,-2.014219592277211,-0.7120995674844921,-2.586733797010321,-0.8564815886692778,0.23384860111184125,-2.2129559754374917,-4.412172786976221,-0.7458515453967474,-3.6843500549597064,-5.577528448326774,-5.506813417999501,-2.455854912749811,-1.5512380483792998,-1.54388112920808,-4.874707289757685,0.10606446184970864,-2.189291383982994,-2.5040720168968633,-1.1784279218946314,-6.573449422276378,-7.139246259354626,-4.977984678597302,-2.926106925270866,-3.888863572497292,-3.0325970699343063,-1.410654475221862,-4.304687629155712
diff --git a/job_postings.db b/job_postings.db