From 927903934f9dd468810a24e3d90601af758ed8bc Mon Sep 17 00:00:00 2001 From: Gabrielsaa15 Date: Wed, 20 Aug 2025 14:42:06 -0500 Subject: [PATCH 1/4] Add files via upload --- Labs/Python_Notebooks/LAB2/prac.ipynb | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 Labs/Python_Notebooks/LAB2/prac.ipynb diff --git a/Labs/Python_Notebooks/LAB2/prac.ipynb b/Labs/Python_Notebooks/LAB2/prac.ipynb new file mode 100644 index 000000000..e69de29bb From 54269c2b73f366ef846943e6565e72b64f4f7071 Mon Sep 17 00:00:00 2001 From: Gabrielsaa15 Date: Wed, 20 Aug 2025 14:50:23 -0500 Subject: [PATCH 2/4] Revert "Add files via upload" This reverts commit 927903934f9dd468810a24e3d90601af758ed8bc. --- Labs/Python_Notebooks/LAB2/prac.ipynb | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 Labs/Python_Notebooks/LAB2/prac.ipynb diff --git a/Labs/Python_Notebooks/LAB2/prac.ipynb b/Labs/Python_Notebooks/LAB2/prac.ipynb deleted file mode 100644 index e69de29bb..000000000 From 40d664a33bb3eb067b99f45658e38266bfc8e7fc Mon Sep 17 00:00:00 2001 From: Gabrielsaa15 Date: Wed, 20 Aug 2025 14:52:39 -0500 Subject: [PATCH 3/4] Delete Labs/Python_Notebooks/LAB2/prac.ipynb --- Labs/Python_Notebooks/LAB2/prac.ipynb | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 Labs/Python_Notebooks/LAB2/prac.ipynb diff --git a/Labs/Python_Notebooks/LAB2/prac.ipynb b/Labs/Python_Notebooks/LAB2/prac.ipynb deleted file mode 100644 index e69de29bb..000000000 From 43f5eb0876734278b49d5b99b2eb452f734b43d2 Mon Sep 17 00:00:00 2001 From: Gabrielsaa15 Date: Wed, 20 Aug 2025 14:53:33 -0500 Subject: [PATCH 4/4] Add files via upload Tarea en laboratorio --- .../LAB2/diego_saavedra_lab2_I.ipynb | 419 ++++++++++++++++++ 1 file changed, 419 insertions(+) create mode 100644 Labs/Python_Notebooks/LAB2/diego_saavedra_lab2_I.ipynb diff --git a/Labs/Python_Notebooks/LAB2/diego_saavedra_lab2_I.ipynb b/Labs/Python_Notebooks/LAB2/diego_saavedra_lab2_I.ipynb new file mode 100644 index 
000000000..95456a5e2 --- /dev/null +++ b/Labs/Python_Notebooks/LAB2/diego_saavedra_lab2_I.ipynb @@ -0,0 +1,419 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "5b238174", + "metadata": {}, + "source": [ + "# Web Scraping: Selenium" + ] + }, + { + "cell_type": "markdown", + "id": "d4e3b17c", + "metadata": {}, + "source": [ + "## 1. Libraries" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "dc2d3ffc", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import time\n", + "\n", + "# Herramientas de Selenium\n", + "from selenium import webdriver\n", + "from selenium.webdriver.common.by import By\n", + "from selenium.webdriver.chrome.options import Options\n", + "from selenium.webdriver.support.ui import WebDriverWait\n", + "from selenium.webdriver.support import expected_conditions as EC\n", + "from selenium.common.exceptions import NoSuchElementException, TimeoutException\n" + ] + }, + { + "cell_type": "markdown", + "id": "5f8e0e4a", + "metadata": {}, + "source": [ + "## 2. Configuración e Inicialización del Navegador\n", + "\n", + "Aquí configuramos y lanzamos el navegador Chrome que será controlado por nuestro script. Dejamos que el Selenium Manager integrado se encargue de gestionar el chromedriver por nosotros, lo que simplifica mucho la configuración." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "b2aaba9b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "WebDriver iniciado con éxito.\n" + ] + } + ], + "source": [ + "# Configuramos las opciones de Chrome\n", + "chrome_options = Options()\n", + "# chrome_options.add_argument(\"--headless\")\n", + "chrome_options.add_argument(\"--start-maximized\")\n", + "chrome_options.add_argument(\"--lang=en-US\")\n", + "\n", + "# Iniciar el WebDriver de Chrome\n", + "# Pista: Crea una variable llamada 'driver' y asígnale la instancia de webdriver.Chrome(),\n", + "# pasando nuestras 'chrome_options' como argumento.\n", + "driver = webdriver.Chrome(options=chrome_options)\n", + "\n", + "print(\"WebDriver iniciado con éxito.\")" + ] + }, + { + "cell_type": "markdown", + "id": "27f43e70", + "metadata": {}, + "source": [ + "## 3. Navegar a la Página de IMDb\n", + "\n", + "Navegamos a la URL del Top 250 de IMDb. El paso más importante aquí es usar WebDriverWait. Le decimos a Selenium que espere hasta 10 segundos a que la lista de películas sea visible en la página antes de intentar hacer cualquier cosa. Esto hace nuestro script robusto frente a conexiones lentas." + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "a92ee23e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Esperando a que la lista de películas cargue...\n", + "Lista de películas encontrada. ¡A scrapear!\n" + ] + } + ], + "source": [ + "# URL del Top 250 de IMDb\n", + "url = \"https://www.imdb.com/chart/top/\"\n", + "\n", + "# Pista: El objeto 'driver' tiene un método para abrir una URL. 
¿Cuál es?\n", + "driver.get(url) # solo la URL\n", + "# Lista para guardar los datos de cada película\n", + "movies_data = []\n", + "\n", + "# Selector CSS para la lista que contiene todas las películas\n", + "movie_list_selector = \"ul.ipc-metadata-list\"\n", + "\n", + "try:\n", + " print(\"Esperando a que la lista de películas cargue...\")\n", + " \n", + " # Completa la espera para que el script se detenga hasta que la lista de películas sea visible.\n", + " # Pista: Usa EC.visibility_of_element_located() y pásale una tupla con el método de búsqueda (By) y el selector.\n", + " WebDriverWait(driver, 10).until(\n", + " EC.visibility_of_element_located(( By.CSS_SELECTOR, movie_list_selector))\n", + " )\n", + " print(\"Lista de películas encontrada. ¡A scrapear!\")\n", + "\n", + "except TimeoutException:\n", + " print(\"Error: La lista de películas no cargó a tiempo. El script se detendrá.\")\n", + " driver.quit()" + ] + }, + { + "cell_type": "markdown", + "id": "98b449d3", + "metadata": {}, + "source": [ + "## 4. Bucle Principal de Scraping\n", + "\n", + "Este es el núcleo de nuestro scraper.\n", + "\n", + "- Localizamos todos los elementos `<li>` que contienen la información de cada película.\n", + "- Iteramos sobre los primeros 50 elementos de esa lista.\n", + "- Dentro de cada `<li>`, buscamos los datos específicos (rango, título, año, calificación y URL) usando selectores CSS.\n", + "- Usamos un bloque try-except por película. Si algún dato no se encuentra en una película, el script imprimirá el error y continuará con la siguiente, en lugar de detenerse por un error." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "4d0d1eca", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Scraped: #1 The Shawshank Redemption\n", + " Scraped: #2 The Godfather\n", + " Scraped: #3 The Dark Knight\n", + " Scraped: #4 The Godfather Part II\n", + " Scraped: #5 12 Angry Men\n", + " Scraped: #6 The Lord of the Rings: The Return of the King\n", + " Scraped: #7 Schindler's List\n", + " Scraped: #8 Pulp Fiction\n", + " Scraped: #9 The Lord of the Rings: The Fellowship of the Ring\n", + " Scraped: #10 The Good, the Bad and the Ugly\n", + " Scraped: #11 Forrest Gump\n", + " Scraped: #12 The Lord of the Rings: The Two Towers\n", + " Scraped: #13 Fight Club\n", + " Scraped: #14 Inception\n", + " Scraped: #15 Star Wars: Episode V - The Empire Strikes Back\n", + " Scraped: #16 The Matrix\n", + " Scraped: #17 Goodfellas\n", + " Scraped: #18 Interstellar\n", + " Scraped: #19 One Flew Over the Cuckoo's Nest\n", + " Scraped: #20 Se7en\n", + " Scraped: #21 It's a Wonderful Life\n", + " Scraped: #22 The Silence of the Lambs\n", + " Scraped: #23 Seven Samurai\n", + " Scraped: #24 Saving Private Ryan\n", + " Scraped: #25 The Green Mile\n", + " Scraped: #26 City of God\n", + " Scraped: #27 Life Is Beautiful\n", + " Scraped: #28 Terminator 2: Judgment Day\n", + " Scraped: #29 Star Wars: Episode IV - A New Hope\n", + " Scraped: #30 Back to the Future\n", + " Scraped: #31 Spirited Away\n", + " Scraped: #32 The Pianist\n", + " Scraped: #33 Gladiator\n", + " Scraped: #34 Parasite\n", + " Scraped: #35 Psycho\n", + " Scraped: #36 The Lion King\n", + " Scraped: #37 Grave of the Fireflies\n", + " Scraped: #38 The Departed\n", + " Scraped: #39 Whiplash\n", + " Scraped: #40 Harakiri\n", + " Scraped: #41 The Prestige\n", + " Scraped: #42 American History X\n", + " Scraped: #43 Léon: The Professional\n", + " Scraped: #44 Spider-Man: Across the Spider-Verse\n", + " Scraped: #45 Casablanca\n", + " Scraped: #46 
Cinema Paradiso\n", + " Scraped: #47 The Usual Suspects\n", + " Scraped: #48 The Intouchables\n", + " Scraped: #49 Alien\n", + " Scraped: #50 Modern Times\n", + "\n", + "Scraping completado. Se extrajeron datos de 50 películas.\n" + ] + } + ], + "source": [ + "# Selector para cada item (película) en la lista\n", + "movie_item_selector = \"li.ipc-metadata-list-summary-item\"\n", + "\n", + "# Pista: Usa el método 'find_elements' del driver para obtener una lista de todos los elementos que coincidan con 'movie_item_selector'.\n", + "movie_elements = driver.find_elements(By.CSS_SELECTOR, movie_item_selector)\n", + "\n", + "# Iteramos solo sobre las primeras 50 películas\n", + "for movie in movie_elements[:50]:\n", + " try:\n", + " # --- Rango y Título ---\n", + " # Pista: Primero, encuentra el elemento h3 con la clase 'ipc-title__text'. Luego, obtén su '.text'.\n", + " title_element = movie.find_element(By.CSS_SELECTOR, \"h3.ipc-title__text\")\n", + " full_title_text = title_element.text.strip() # ej. \"1. The Shawshank Redemption\"\n", + " rank, title = full_title_text.split(\". \", 1) # -> \"1\", \"The Shawshank Redemption\"\n", + "\n", + " # --- Año --- \n", + " year = movie.find_element(By.CSS_SELECTOR, \"div.cli-title-metadata > span\").text\n", + " \n", + " # --- Calificación --- \n", + " rating_element = movie.find_element(By.CSS_SELECTOR, \"span.ipc-rating-star\")\n", + " rating = rating_element.text.split('\\n')[0]\n", + "\n", + " # --- URL de la película ---\n", + " # Pista: El enlace está en el atributo 'href' de la etiqueta <a>. 
Usa '.get_attribute()'\n", + " url_element = movie.find_element(By.CSS_SELECTOR, \"a.ipc-title-link-wrapper\")\n", + " movie_url = url_element.get_attribute(\"href\") # <- AQUÍ\n", + " # Asegúrate de que los nombres de las variables coincidan con las que creaste arriba.\n", + " movies_data.append({\n", + " \"Rango\": int(rank), \n", + " \"Titulo\": title, \n", + " \"Año\": year,\n", + " \"Calificacion_IMDb\": rating,\n", + " \"URL\": movie_url\n", + " })\n", + " print(f\" Scraped: #{rank} {title}\")\n", + "\n", + " except Exception as e:\n", + " print(f\" Error extrayendo datos de una película. Error: {e}\")\n", + " continue\n", + "\n", + "print(f\"\\nScraping completado. Se extrajeron datos de {len(movies_data)} películas.\")" + ] + }, + { + "cell_type": "markdown", + "id": "e593f83e", + "metadata": {}, + "source": [ + "## 5. Crear el DataFrame y Guardar los Datos\n", + "\n", + "Una vez que tenemos nuestra lista de diccionarios, la convertimos en un DataFrame de pandas, que es una estructura tipo tabla ideal para el análisis y almacenamiento de datos. Finalmente, lo guardamos en un archivo CSV y cerramos el navegador para liberar los recursos del sistema." + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "318ab0d9", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + " Datos guardados exitosamente.\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
RangoTituloAñoCalificacion_IMDbURL
01The Shawshank Redemption19949.3https://www.imdb.com/title/tt0111161/?ref_=cht...
12The Godfather19729.2https://www.imdb.com/title/tt0068646/?ref_=cht...
23The Dark Knight20089.1https://www.imdb.com/title/tt0468569/?ref_=cht...
34The Godfather Part II19749.0https://www.imdb.com/title/tt0071562/?ref_=cht...
4512 Angry Men19579.0https://www.imdb.com/title/tt0050083/?ref_=cht...
\n", + "
" + ], + "text/plain": [ + " Rango Titulo Año Calificacion_IMDb \\\n", + "0 1 The Shawshank Redemption 1994 9.3 \n", + "1 2 The Godfather 1972 9.2 \n", + "2 3 The Dark Knight 2008 9.1 \n", + "3 4 The Godfather Part II 1974 9.0 \n", + "4 5 12 Angry Men 1957 9.0 \n", + "\n", + " URL \n", + "0 https://www.imdb.com/title/tt0111161/?ref_=cht... \n", + "1 https://www.imdb.com/title/tt0068646/?ref_=cht... \n", + "2 https://www.imdb.com/title/tt0468569/?ref_=cht... \n", + "3 https://www.imdb.com/title/tt0071562/?ref_=cht... \n", + "4 https://www.imdb.com/title/tt0050083/?ref_=cht... " + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Navegador cerrado correctamente. ¡Ejercicio terminado!\n" + ] + } + ], + "source": [ + "if movies_data:\n", + " # Pista: Llama a pd.DataFrame() y pásale la lista que contiene todos nuestros datos.\n", + " df = pd.DataFrame(movies_data)\n", + " # Pista: Usa el método '.to_csv()'. Dale un nombre de archivo, por ejemplo, \"imdb_top_50.csv\", y no te olvides de poner index=False.\n", + " [df.to_csv(\"imdb_top_50.csv\", index=False)]\n", + " \n", + " print(\"\\n Datos guardados exitosamente.\")\n", + " display(df.head())\n", + "else:\n", + " print(\"\\nNo se pudo extraer ningún dato de las películas.\")\n", + "\n", + "# Cerrar el navegador\n", + "# Pista: Hay un método en el objeto 'driver' para cerrar todas las ventanas y terminar la sesión.\n", + "[driver.quit()]\n", + "\n", + "print(\"\\nNavegador cerrado correctamente. 
¡Ejercicio terminado!\")\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "yelp_scraper", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.18" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}