From 927903934f9dd468810a24e3d90601af758ed8bc Mon Sep 17 00:00:00 2001
From: Gabrielsaa15 <dg.saavedraal@alum.up.edu.pe>
Date: Wed, 20 Aug 2025 14:42:06 -0500
Subject: [PATCH 1/4] Add files via upload

---
 Labs/Python_Notebooks/LAB2/prac.ipynb | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 Labs/Python_Notebooks/LAB2/prac.ipynb

diff --git a/Labs/Python_Notebooks/LAB2/prac.ipynb b/Labs/Python_Notebooks/LAB2/prac.ipynb
new file mode 100644
index 000000000..e69de29bb

From 54269c2b73f366ef846943e6565e72b64f4f7071 Mon Sep 17 00:00:00 2001
From: Gabrielsaa15 <dg.saavedraal@alum.up.edu.pe>
Date: Wed, 20 Aug 2025 14:50:23 -0500
Subject: [PATCH 2/4] Revert "Add files via upload"

This reverts commit 927903934f9dd468810a24e3d90601af758ed8bc.
---
 Labs/Python_Notebooks/LAB2/prac.ipynb | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 delete mode 100644 Labs/Python_Notebooks/LAB2/prac.ipynb

diff --git a/Labs/Python_Notebooks/LAB2/prac.ipynb b/Labs/Python_Notebooks/LAB2/prac.ipynb
deleted file mode 100644
index e69de29bb..000000000

From 40d664a33bb3eb067b99f45658e38266bfc8e7fc Mon Sep 17 00:00:00 2001
From: Gabrielsaa15 <dg.saavedraal@alum.up.edu.pe>
Date: Wed, 20 Aug 2025 14:52:39 -0500
Subject: [PATCH 3/4] Delete Labs/Python_Notebooks/LAB2/prac.ipynb

---
 Labs/Python_Notebooks/LAB2/prac.ipynb | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 delete mode 100644 Labs/Python_Notebooks/LAB2/prac.ipynb

diff --git a/Labs/Python_Notebooks/LAB2/prac.ipynb b/Labs/Python_Notebooks/LAB2/prac.ipynb
deleted file mode 100644
index e69de29bb..000000000

From 43f5eb0876734278b49d5b99b2eb452f734b43d2 Mon Sep 17 00:00:00 2001
From: Gabrielsaa15 <dg.saavedraal@alum.up.edu.pe>
Date: Wed, 20 Aug 2025 14:53:33 -0500
Subject: [PATCH 4/4] Add files via upload

Tarea en laboratorio
---
 .../LAB2/diego_saavedra_lab2_I.ipynb          | 419 ++++++++++++++++++
 1 file changed, 419 insertions(+)
 create mode 100644 Labs/Python_Notebooks/LAB2/diego_saavedra_lab2_I.ipynb

diff --git a/Labs/Python_Notebooks/LAB2/diego_saavedra_lab2_I.ipynb b/Labs/Python_Notebooks/LAB2/diego_saavedra_lab2_I.ipynb
new file mode 100644
index 000000000..95456a5e2
--- /dev/null
+++ b/Labs/Python_Notebooks/LAB2/diego_saavedra_lab2_I.ipynb
@@ -0,0 +1,419 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "5b238174",
+   "metadata": {},
+   "source": [
+    "# Web Scraping: Selenium"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d4e3b17c",
+   "metadata": {},
+   "source": [
+    "## 1. Libraries"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "dc2d3ffc",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import time\n",
+    "\n",
+    "# Herramientas de Selenium\n",
+    "from selenium import webdriver\n",
+    "from selenium.webdriver.common.by import By\n",
+    "from selenium.webdriver.chrome.options import Options\n",
+    "from selenium.webdriver.support.ui import WebDriverWait\n",
+    "from selenium.webdriver.support import expected_conditions as EC\n",
+    "from selenium.common.exceptions import NoSuchElementException, TimeoutException\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "5f8e0e4a",
+   "metadata": {},
+   "source": [
+    "## 2. Configuración e Inicialización del Navegador\n",
+    "\n",
+    "Aquí configuramos y lanzamos el navegador Chrome que será controlado por nuestro script. Dejamos que el Selenium Manager integrado se encargue de gestionar el chromedriver por nosotros, lo que simplifica mucho la configuración."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "b2aaba9b",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "WebDriver iniciado con éxito.\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Configuramos las opciones de Chrome\n",
+    "chrome_options = Options()\n",
+    "# chrome_options.add_argument(\"--headless\")\n",
+    "chrome_options.add_argument(\"--start-maximized\")\n",
+    "chrome_options.add_argument(\"--lang=en-US\")\n",
+    "\n",
+    "# Iniciar el WebDriver de Chrome\n",
+    "# Pista: Crea una variable llamada 'driver' y asígnale la instancia de webdriver.Chrome(),\n",
+    "# pasando nuestras 'chrome_options' como argumento.\n",
+    "driver =  webdriver.Chrome(options=chrome_options)\n",
+    "\n",
+    "print(\"WebDriver iniciado con éxito.\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "27f43e70",
+   "metadata": {},
+   "source": [
+    "## 3. Navegar a la Página de IMDb\n",
+    "\n",
+    "Navegamos a la URL del Top 250 de IMDb. El paso más importante aquí es usar WebDriverWait. Le decimos a Selenium que espere hasta 10 segundos a que la lista de películas sea visible en la página antes de intentar hacer cualquier cosa. Esto hace nuestro script robusto frente a conexiones lentas."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "id": "a92ee23e",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Esperando a que la lista de películas cargue...\n",
+      "Lista de películas encontrada. ¡A scrapear!\n"
+     ]
+    }
+   ],
+   "source": [
+    "# URL del Top 250 de IMDb\n",
+    "url = \"https://www.imdb.com/chart/top/\"\n",
+    "\n",
+    "# Pista: El objeto 'driver' tiene un método para abrir una URL. ¿Cuál es?\n",
+    "driver.get(url)  # solo la URL\n",
+    "# Lista para guardar los datos de cada película\n",
+    "movies_data = []\n",
+    "\n",
+    "# Selector CSS para la lista que contiene todas las películas\n",
+    "movie_list_selector = \"ul.ipc-metadata-list\"\n",
+    "\n",
+    "try:\n",
+    "    print(\"Esperando a que la lista de películas cargue...\")\n",
+    "    \n",
+    "    # Completa la espera para que el script se detenga hasta que la lista de películas sea visible.\n",
+    "    # Pista: Usa EC.visibility_of_element_located() y pásale una tupla con el método de búsqueda (By) y el selector.\n",
+    "    WebDriverWait(driver, 10).until(\n",
+    "        EC.visibility_of_element_located(( By.CSS_SELECTOR, movie_list_selector))\n",
+    "    )\n",
+    "    print(\"Lista de películas encontrada. ¡A scrapear!\")\n",
+    "\n",
+    "except TimeoutException:\n",
+    "    print(\"Error: La lista de películas no cargó a tiempo. El script se detendrá.\")\n",
+    "    driver.quit()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "98b449d3",
+   "metadata": {},
+   "source": [
+    "## 4. Bucle Principal de Scraping\n",
+    "\n",
+    "Este es el núcleo de nuestro scraper.\n",
+    "\n",
+    "- Localizamos todos los elementos `<li>` que contienen la información de cada película.\n",
+    "- Iteramos sobre los primeros 50 elementos de esa lista.\n",
+    "- Dentro de cada `<li>`, buscamos los datos específicos (rango, título, año, calificación y URL) usando selectores CSS.\n",
+    "- Envolvemos la extracción de cada película en un único bloque try-except. Si algún dato no se encuentra, el script imprime el error, omite esa película y continúa con la siguiente, en lugar de detenerse."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "id": "4d0d1eca",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      " Scraped: #1 The Shawshank Redemption\n",
+      " Scraped: #2 The Godfather\n",
+      " Scraped: #3 The Dark Knight\n",
+      " Scraped: #4 The Godfather Part II\n",
+      " Scraped: #5 12 Angry Men\n",
+      " Scraped: #6 The Lord of the Rings: The Return of the King\n",
+      " Scraped: #7 Schindler's List\n",
+      " Scraped: #8 Pulp Fiction\n",
+      " Scraped: #9 The Lord of the Rings: The Fellowship of the Ring\n",
+      " Scraped: #10 The Good, the Bad and the Ugly\n",
+      " Scraped: #11 Forrest Gump\n",
+      " Scraped: #12 The Lord of the Rings: The Two Towers\n",
+      " Scraped: #13 Fight Club\n",
+      " Scraped: #14 Inception\n",
+      " Scraped: #15 Star Wars: Episode V - The Empire Strikes Back\n",
+      " Scraped: #16 The Matrix\n",
+      " Scraped: #17 Goodfellas\n",
+      " Scraped: #18 Interstellar\n",
+      " Scraped: #19 One Flew Over the Cuckoo's Nest\n",
+      " Scraped: #20 Se7en\n",
+      " Scraped: #21 It's a Wonderful Life\n",
+      " Scraped: #22 The Silence of the Lambs\n",
+      " Scraped: #23 Seven Samurai\n",
+      " Scraped: #24 Saving Private Ryan\n",
+      " Scraped: #25 The Green Mile\n",
+      " Scraped: #26 City of God\n",
+      " Scraped: #27 Life Is Beautiful\n",
+      " Scraped: #28 Terminator 2: Judgment Day\n",
+      " Scraped: #29 Star Wars: Episode IV - A New Hope\n",
+      " Scraped: #30 Back to the Future\n",
+      " Scraped: #31 Spirited Away\n",
+      " Scraped: #32 The Pianist\n",
+      " Scraped: #33 Gladiator\n",
+      " Scraped: #34 Parasite\n",
+      " Scraped: #35 Psycho\n",
+      " Scraped: #36 The Lion King\n",
+      " Scraped: #37 Grave of the Fireflies\n",
+      " Scraped: #38 The Departed\n",
+      " Scraped: #39 Whiplash\n",
+      " Scraped: #40 Harakiri\n",
+      " Scraped: #41 The Prestige\n",
+      " Scraped: #42 American History X\n",
+      " Scraped: #43 Léon: The Professional\n",
+      " Scraped: #44 Spider-Man: Across the Spider-Verse\n",
+      " Scraped: #45 Casablanca\n",
+      " Scraped: #46 Cinema Paradiso\n",
+      " Scraped: #47 The Usual Suspects\n",
+      " Scraped: #48 The Intouchables\n",
+      " Scraped: #49 Alien\n",
+      " Scraped: #50 Modern Times\n",
+      "\n",
+      "Scraping completado. Se extrajeron datos de 50 películas.\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Selector para cada item (película) en la lista\n",
+    "movie_item_selector = \"li.ipc-metadata-list-summary-item\"\n",
+    "\n",
+    "# Pista: Usa el método 'find_elements' del driver para obtener una lista de todos los elementos que coincidan con 'movie_item_selector'.\n",
+    "movie_elements =  driver.find_elements(By.CSS_SELECTOR, movie_item_selector)\n",
+    "\n",
+    "# Iteramos solo sobre las primeras 50 películas\n",
+    "for movie in movie_elements[:50]:\n",
+    "    try:\n",
+    "        # --- Rango y Título ---\n",
+    "        # Pista: Primero, encuentra el elemento h3 con la clase 'ipc-title__text'. Luego, obtén su '.text'.\n",
+    "        title_element = movie.find_element(By.CSS_SELECTOR, \"h3.ipc-title__text\")\n",
+    "        full_title_text = title_element.text.strip()                 # ej. \"1. The Shawshank Redemption\"\n",
+    "        rank, title = full_title_text.split(\". \", 1)                 # -> \"1\", \"The Shawshank Redemption\"\n",
+    "\n",
+    "        # --- Año --- \n",
+    "        year = movie.find_element(By.CSS_SELECTOR, \"div.cli-title-metadata > span\").text\n",
+    "        \n",
+    "        # --- Calificación --- \n",
+    "        rating_element = movie.find_element(By.CSS_SELECTOR, \"span.ipc-rating-star\")\n",
+    "        rating = rating_element.text.split('\\n')[0]\n",
+    "\n",
+    "        # --- URL de la película ---\n",
+    "        # Pista: El enlace está en el atributo 'href' de la etiqueta <a>. Usa '.get_attribute()'\n",
+    "        url_element = movie.find_element(By.CSS_SELECTOR, \"a.ipc-title-link-wrapper\")\n",
+    "        movie_url = url_element.get_attribute(\"href\")   # <- AQUÍ\n",
+    "        # Asegúrate de que los nombres de las variables coincidan con las que creaste arriba.\n",
+    "        movies_data.append({\n",
+    "            \"Rango\": int(rank),         \n",
+    "            \"Titulo\": title,             \n",
+    "            \"Año\": year,\n",
+    "            \"Calificacion_IMDb\": rating,\n",
+    "            \"URL\": movie_url\n",
+    "        })\n",
+    "        print(f\" Scraped: #{rank} {title}\")\n",
+    "\n",
+    "    except Exception as e:\n",
+    "        print(f\" Error extrayendo datos de una película. Error: {e}\")\n",
+    "        continue\n",
+    "\n",
+    "print(f\"\\nScraping completado. Se extrajeron datos de {len(movies_data)} películas.\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "e593f83e",
+   "metadata": {},
+   "source": [
+    "## 5. Crear el DataFrame y Guardar los Datos\n",
+    "\n",
+    "Una vez que tenemos nuestra lista de diccionarios, la convertimos en un DataFrame de pandas, que es una estructura tipo tabla ideal para el análisis y almacenamiento de datos. Finalmente, lo guardamos en un archivo CSV y cerramos el navegador para liberar los recursos del sistema."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "id": "318ab0d9",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      " Datos guardados exitosamente.\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Rango</th>\n",
+       "      <th>Titulo</th>\n",
+       "      <th>Año</th>\n",
+       "      <th>Calificacion_IMDb</th>\n",
+       "      <th>URL</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>1</td>\n",
+       "      <td>The Shawshank Redemption</td>\n",
+       "      <td>1994</td>\n",
+       "      <td>9.3</td>\n",
+       "      <td>https://www.imdb.com/title/tt0111161/?ref_=cht...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>2</td>\n",
+       "      <td>The Godfather</td>\n",
+       "      <td>1972</td>\n",
+       "      <td>9.2</td>\n",
+       "      <td>https://www.imdb.com/title/tt0068646/?ref_=cht...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>3</td>\n",
+       "      <td>The Dark Knight</td>\n",
+       "      <td>2008</td>\n",
+       "      <td>9.1</td>\n",
+       "      <td>https://www.imdb.com/title/tt0468569/?ref_=cht...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>4</td>\n",
+       "      <td>The Godfather Part II</td>\n",
+       "      <td>1974</td>\n",
+       "      <td>9.0</td>\n",
+       "      <td>https://www.imdb.com/title/tt0071562/?ref_=cht...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>5</td>\n",
+       "      <td>12 Angry Men</td>\n",
+       "      <td>1957</td>\n",
+       "      <td>9.0</td>\n",
+       "      <td>https://www.imdb.com/title/tt0050083/?ref_=cht...</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   Rango                    Titulo   Año Calificacion_IMDb  \\\n",
+       "0      1  The Shawshank Redemption  1994               9.3   \n",
+       "1      2             The Godfather  1972               9.2   \n",
+       "2      3           The Dark Knight  2008               9.1   \n",
+       "3      4     The Godfather Part II  1974               9.0   \n",
+       "4      5              12 Angry Men  1957               9.0   \n",
+       "\n",
+       "                                                 URL  \n",
+       "0  https://www.imdb.com/title/tt0111161/?ref_=cht...  \n",
+       "1  https://www.imdb.com/title/tt0068646/?ref_=cht...  \n",
+       "2  https://www.imdb.com/title/tt0468569/?ref_=cht...  \n",
+       "3  https://www.imdb.com/title/tt0071562/?ref_=cht...  \n",
+       "4  https://www.imdb.com/title/tt0050083/?ref_=cht...  "
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Navegador cerrado correctamente. ¡Ejercicio terminado!\n"
+     ]
+    }
+   ],
+   "source": [
+    "if movies_data:\n",
+    "    # Pista: Llama a pd.DataFrame() y pásale la lista que contiene todos nuestros datos.\n",
+    "    df = pd.DataFrame(movies_data)\n",
+    "    # Pista: Usa el método '.to_csv()'. Dale un nombre de archivo, por ejemplo, \"imdb_top_50.csv\", y no te olvides de poner index=False.\n",
+    "    [df.to_csv(\"imdb_top_50.csv\", index=False)]\n",
+    "    \n",
+    "    print(\"\\n Datos guardados exitosamente.\")\n",
+    "    display(df.head())\n",
+    "else:\n",
+    "    print(\"\\nNo se pudo extraer ningún dato de las películas.\")\n",
+    "\n",
+    "# Cerrar el navegador\n",
+    "# Pista: Hay un método en el objeto 'driver' para cerrar todas las ventanas y terminar la sesión.\n",
+    "[driver.quit()]\n",
+    "\n",
+    "print(\"\\nNavegador cerrado correctamente. ¡Ejercicio terminado!\")\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "yelp_scraper",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.18"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}