# Extract every occurrence of "puppy"/"Puppy" and its plural "puppies".
#
# Why the pattern changed:
#   * [P-p] is a character *range* from 'P' (80) to 'p' (112); it also admits
#     Q-Z, the punctuation between the two alphabets and a-o, so words such as
#     "guppy" would match. [Pp] lists exactly the two letters wanted.
#   * "puppys?" can only match "puppy"/"puppys", never "puppies".
#   * The raw string plus \b keeps the match on whole words.
print(re.findall(r'\b[Pp]upp(?:y|ies)\b', text))
# Extract every occurrence of the words "ran" and "run" (either capitalisation).
#
# [R-r] in the previous pattern is an ASCII range ('R'..'r'), which includes
# every lowercase letter up to 'r' -- so "can", "man" or "fun" would match too.
# [Rr] names exactly the two intended letters, [au] covers both spellings, and
# \b stops the pattern from firing inside longer words such as "brand".
print(re.findall(r'\b[Rr][au]n\b', text))
# Extract the complete words that contain "ea" or "eo".
#
# The previous pattern stopped right after the vowel pair, so the recorded
# result was a list of fragments ('pea', 'peo', 'rea', 'brea', 'Nea') instead
# of words. Here \w* on both sides of e[ao] captures the whole word, and it
# also allows the pair to sit at the very start of a word.
# A raw string is used so that \w and \b reach the regex engine unchanged
# (in a normal string literal "\w" is an invalid escape sequence).
re.findall(r'\b\w*e[ao]\w*\b', text)
# Extract every number in the text as one token.
#
# '\d?\d' matches at most two digits, so a longer number such as "100" would be
# returned in pieces ("10", "0"). \d+ takes the full run of digits.
# Raw string: "\d" is an invalid escape sequence in a normal string literal.
re.findall(r'\d+', text)
def quotes_parser(content):
    """Extract the quote texts from one scraped page.

    content: raw HTML of the page, as handed over by IronhackSpider.scrape_url
    returns: list of str, one entry per <span itemprop="text"> element, with
             surrounding whitespace removed
    """
    html = bs(content, "html.parser")
    quotes = html.find_all('span', attrs={'itemprop': 'text'})
    # The previous version appended `el.text.strip().split("))`, whose string
    # literal was never closed, so the whole cell failed with a SyntaxError.
    # The broken .split(...) call is dropped; it would also have turned every
    # quote into a list instead of a string.
    # NOTE(review): if the intent was to remove the quotation marks that wrap
    # each quote, chain a second .strip(<those characters>) here -- confirm.
    return [el.text.strip() for el in quotes]
def kickstart(self):
    """Scrape pages 1..pages_to_scrape, one request per page.

    When self.sleep_interval is positive the spider pauses that many seconds
    *between* consecutive requests; otherwise requests are sent back to back.

    The previous version did `return time.sleep(self.sleep_interval)` when a
    positive interval was set: it slept once and returned without scraping a
    single page.
    """
    for page in range(1, self.pages_to_scrape + 1):
        # No pause before the first request -- there is nothing to throttle yet.
        if page > 1 and self.sleep_interval > 0:
            time.sleep(self.sleep_interval)
        self.scrape_url(self.url_pattern % page)
def kickstart(self, sleep_interval=None):
    """Scrape pages 1..pages_to_scrape with an optional delay between requests.

    self: the spider; url_pattern, pages_to_scrape, sleep_interval and
          scrape_url are read from it
    sleep_interval: seconds to pause between two requests. When omitted, the
          spider's own self.sleep_interval is used. Values <= 0 disable the delay.

    Changes from the previous version:
      * sleep_interval now has a default, so a call without arguments works.
      * The pause happens between requests; before, the code slept once and
        then sent every request back to back.
      * The duplicated scraping loop is merged into one.

    NOTE(review): this is a module-level function. It only takes effect for
    `my_spider.kickstart()` if it is attached to the class
    (e.g. `IronhackSpider.kickstart = kickstart`) -- confirm how the notebook
    intends to use it.
    """
    pause = self.sleep_interval if sleep_interval is None else sleep_interval
    for page in range(1, self.pages_to_scrape + 1):
        # Throttle only between requests, never before the first one.
        if page > 1 and pause > 0:
            time.sleep(pause)
        self.scrape_url(self.url_pattern % page)
def scrape_url(self, url):
    """Download one url, parse it and hand the result to output_results.

    url: the address to request

    Nothing is returned; every outcome is reported with print().

    Changes from the previous version:
      * The request itself is inside the try block. Before, the only guarded
        call was a second, pointless request issued after a 4xx reply, and its
        handlers printed the status code of the *earlier* response.
      * A timeout is passed to requests.get so that a silent server cannot
        block the spider forever.
      * Replies in the 3xx range are no longer reported as server errors.
      * A spider created without content_parser passes the raw content on
        instead of failing on `None(...)`.
    """
    try:
        # 10 s is an arbitrary but generous limit for a single page.
        response = requests.get(url, timeout=10)
    except requests.exceptions.Timeout:
        print('request failed because it timed out, url=', url)
        return
    except requests.exceptions.TooManyRedirects:
        print('request failed because of too many redirects, url=', url)
        return
    except requests.exceptions.SSLError as error:
        print('request failed because of an SSL error, url=', url, 'error=', error)
        return
    except requests.exceptions.RequestException as error:
        # Base class of the exceptions above: anything else goes here.
        print('request failed, url=', url, 'error=', error)
        return

    status = response.status_code
    if status < 300:
        parser = self.content_parser
        result = parser(response.content) if parser is not None else response.content
        self.output_results(result)
        print('request was successful, status code=', status)
    elif status < 400:
        print('request was redirected and the redirect was not followed, status code=', status)
    elif status < 500:
        print('request failed because the resource either does not exist or is forbidden ,status code=', status)
    else:
        print('request failed because the response server encountered an error,status code=', status)
def kickstart(self):
    """Start the scraping job: request every page from 1 to pages_to_scrape.

    A positive self.sleep_interval is honoured as a pause (in seconds) between
    two consecutive requests. With an interval <= 0 there is no pause.

    Bug fixed: the earlier body returned right after `time.sleep(...)` whenever
    an interval was configured, so a throttled spider never scraped anything.
    """
    for page in range(1, self.pages_to_scrape + 1):
        # Sleep between requests only; the first page is fetched immediately.
        if page > 1 and self.sleep_interval > 0:
            time.sleep(self.sleep_interval)
        self.scrape_url(self.url_pattern % page)
class IronhackSpider:
    """
    Small page-by-page web scraper.

    The constructor arguments are stored on the instance so that the other
    methods can read them later.

    url_pattern: %-style pattern of the urls to scrape; it is formatted with the page number
    pages_to_scrape: how many pages to scrape
    sleep_interval: the time interval in seconds to delay between requests. If <=0, requests will not be delayed.
    content_parser: a function reference that will extract the intended info from the scraped content.
                    When None, the raw content is passed on unchanged.
    """

    # Seconds to wait for a reply before a request is abandoned.
    request_timeout = 10

    def __init__(self, url_pattern, pages_to_scrape=10, sleep_interval=-1, content_parser=None):
        self.url_pattern = url_pattern
        self.pages_to_scrape = pages_to_scrape
        self.sleep_interval = sleep_interval
        self.content_parser = content_parser

    def scrape_url(self, url):
        """Scrape the content of a single url and report the outcome with print().

        The request itself is guarded: network failures are reported and the
        method returns, so one bad page does not abort the whole job.
        (Previously only a redundant second request, sent after a 4xx reply,
        sat inside the try block.)
        """
        try:
            response = requests.get(url, timeout=self.request_timeout)
        except requests.exceptions.Timeout:
            print('request failed because it timed out, url=', url)
            return
        except requests.exceptions.TooManyRedirects:
            print('request failed because of too many redirects, url=', url)
            return
        except requests.exceptions.SSLError as error:
            print('request failed because of an SSL error, url=', url, 'error=', error)
            return
        except requests.exceptions.RequestException as error:
            # Base class of the exceptions above: anything else goes here.
            print('request failed, url=', url, 'error=', error)
            return

        status = response.status_code
        if status < 300:
            parser = self.content_parser
            result = parser(response.content) if parser is not None else response.content
            self.output_results(result)
            print('request was successful, status code=', status)
        elif status < 400:
            print('request was redirected and the redirect was not followed, status code=', status)
        elif status < 500:
            print('request failed because the resource either does not exist or is forbidden ,status code=', status)
        else:
            print('request failed because the response server encountered an error,status code=', status)

    def output_results(self, r):
        """Export the scraped content. Right now it simply prints the result.

        Override or extend this to write to a text file or a database.
        """
        print(r)

    def kickstart(self):
        """Start the scraping job: call scrape_url() for every page, 1..pages_to_scrape.

        A positive sleep_interval is applied as a pause between consecutive
        requests. (Previously a positive interval made this method sleep once
        and return without scraping.)
        """
        for page in range(1, self.pages_to_scrape + 1):
            if page > 1 and self.sleep_interval > 0:
                time.sleep(self.sleep_interval)
            self.scrape_url(self.url_pattern % page)
But in this lab\n", + "you will complete this function so that it extracts the quotes.\n", + "This function will be passed to the IronhackSpider class.\n", + "\"\"\"\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['A Light in the ...', 'Tipping the Velvet', 'Soumission', 'Sharp Objects', 'Sapiens: A Brief History ...', 'The Requiem Red', 'The Dirty Little Secrets ...', 'The Coming Woman: A ...', 'The Boys in the ...', 'The Black Maria', 'Starving Hearts (Triangular Trade ...', \"Shakespeare's Sonnets\", 'Set Me Free', \"Scott Pilgrim's Precious Little ...\", 'Rip it Up and ...', 'Our Band Could Be ...', 'Olio', 'Mesaerion: The Best Science ...', 'Libertarianism for Beginners', \"It's Only the Himalayas\"]\n", + "request was successful, status code= 200\n", + "['In Her Wake', 'How Music Works', 'Foolproof Preserving: A Guide ...', 'Chase Me (Paris Nights ...', 'Black Dust', 'Birdsong: A Story in ...', \"America's Cradle of Quarterbacks: ...\", 'Aladdin and His Wonderful ...', 'Worlds Elsewhere: Journeys Around ...', 'Wall and Piece', 'The Four Agreements: A ...', 'The Five Love Languages: ...', 'The Elephant Tree', 'The Bear and the ...', \"Sophie's World\", 'Penny Maybe', 'Maude (1883-1993):She Grew Up ...', 'In a Dark, Dark ...', 'Behind Closed Doors', \"You can't bury them ...\"]\n", + "request was successful, status code= 200\n", + "['Slow States of Collapse: ...', 'Reasons to Stay Alive', 'Private Paris (Private #10)', '#HigherSelfie: Wake Up Your ...', 'Without Borders (Wanderlove #1)', 'When We Collided', 'We Love You, Charlie ...', 'Untitled Collection: Sabbath Poems ...', 'Unseen City: The Majesty ...', 'Unicorn Tracks', 'Unbound: How Eight Technologies ...', 'Tsubasa: WoRLD CHRoNiCLE 2 ...', 'Throwing Rocks at the ...', 'This One Summer', 'Thirst', 'The Torch Is Passed: ...', 'The Secret of Dreadwillow ...', 'The Pioneer Woman Cooks: ...', 
'The Past Never Ends', 'The Natural History of ...']\n", + "request was successful, status code= 200\n", + "['The Nameless City (The ...', 'The Murder That Never ...', 'The Most Perfect Thing: ...', 'The Mindfulness and Acceptance ...', 'The Life-Changing Magic of ...', 'The Inefficiency Assassin: Time ...', 'The Gutsy Girl: Escapades ...', 'The Electric Pencil: Drawings ...', 'The Death of Humanity: ...', 'The Bulletproof Diet: Lose ...', 'The Art Forger', 'The Age of Genius: ...', \"The Activist's Tao Te ...\", 'Spark Joy: An Illustrated ...', 'Soul Reader', 'Security', 'Saga, Volume 6 (Saga ...', 'Saga, Volume 5 (Saga ...', 'Reskilling America: Learning to ...', 'Rat Queens, Vol. 3: ...']\n", + "request was successful, status code= 200\n", + "['Princess Jellyfish 2-in-1 Omnibus, ...', 'Princess Between Worlds (Wide-Awake ...', 'Pop Gun War, Volume ...', 'Political Suicide: Missteps, Peccadilloes, ...', 'Patience', 'Outcast, Vol. 1: A ...', 'orange: The Complete Collection ...', 'Online Marketing for Busy ...', 'On a Midnight Clear', 'Obsidian (Lux #1)', 'My Paris Kitchen: Recipes ...', 'Masks and Shadows', 'Mama Tried: Traditional Italian ...', 'Lumberjanes, Vol. 2: Friendship ...', 'Lumberjanes, Vol. 1: Beware ...', 'Lumberjanes Vol. 
def quotes_parser(content):
    """Extract the book titles from one scraped catalogue page.

    content: raw HTML of the page, as handed over by IronhackSpider.scrape_url
    returns: list of str, one title per <h3> element, in page order

    The visible text of each heading is abbreviated by the site (the recorded
    run printed titles such as 'A Light in the ...'), so the link's `title`
    attribute is preferred when it is present.
    # assumes the full title is stored in the `title` attribute of the <a>
    # inside each <h3> -- TODO confirm against the page markup
    """
    soup = bs(content, "html.parser")
    titles = []
    for heading in soup.find_all('h3'):
        link = heading.find('a')
        if link is not None and link.get('title'):
            titles.append(link['title'])
        else:
            # Fall back to the (possibly abbreviated) visible text.
            titles.append(heading.text)
    return titles
import datetime
import os

import pandas as pd
import requests

# Credentials are read from the environment so that no token is stored in the
# notebook (set GITHUB_USERNAME and GITHUB_TOKEN before running this cell).
username = os.environ['GITHUB_USERNAME']
token = os.environ['GITHUB_TOKEN']

# Time window: the last seven days, ending now (UTC).
now = datetime.datetime.now(datetime.timezone.utc).replace(microsecond=0)
past_week = now - datetime.timedelta(days=7)

# The previous version built the query string by hand as
#   str(past_week) + "T00:00:00"
# which yields e.g. "2019-10-08 14:03:22.123456T00:00:00": not a valid
# timestamp, and it contains an unencoded space. The timestamps are now
# formatted explicitly and passed through `params`, which url-encodes them.
ISO_FORMAT = '%Y-%m-%dT%H:%M:%SZ'
url = "https://api.github.com/repos/ironhack-labs/data-labs/commits"
params = {
    'since': past_week.strftime(ISO_FORMAT),
    'until': now.strftime(ISO_FORMAT),
    'per_page': 100,
    'page': 1,
}

# The endpoint is paginated; keep requesting pages until a short one arrives,
# otherwise the count silently stops at the first page.
commits = []
while True:
    login = requests.get(url, params=params, auth=(username, token))
    login.raise_for_status()  # fail loudly on bad credentials / rate limiting
    page = login.json()
    commits.extend(page)
    if len(page) < params['per_page']:
        break
    params['page'] += 1

# Building object to count commits in the last past week.
# len() also works when there were no commits at all (an empty frame has no
# 'commit' column, so data_commit['commit'] would raise KeyError).
data_commit = pd.DataFrame(commits)
result = len(data_commit)
print('Numbers of commits =', result)
def openDir(dirs, auth=None):
    """Download the contents listing of every directory url and stack the results.

    dirs: iterable of GitHub "contents" API urls, one per directory
    auth: optional (username, token) tuple. When omitted, the pair is read from
          the GITHUB_USERNAME / GITHUB_TOKEN environment variables, so that no
          token has to be written into the notebook.
    returns: one DataFrame with the rows of all listings concatenated
             (an empty DataFrame when `dirs` is empty)
    raises: whatever response.raise_for_status() raises on a non-2xx reply;
            KeyError when auth is omitted and the environment variables are missing

    Changes from the previous version:
      * The loop iterates over `dirs`. Before, it iterated over the global
        `dir_lst`, so the argument was ignored.
      * The reply status is checked before the JSON is turned into a frame.
      * An empty input no longer fails inside pd.concat.
    """
    import os  # local import: only needed for the credential fallback

    if auth is None:
        auth = (os.environ['GITHUB_USERNAME'], os.environ['GITHUB_TOKEN'])

    frames = []
    for dir_url in dirs:
        reply = requests.get(dir_url, auth=auth)
        reply.raise_for_status()
        frames.append(pd.DataFrame(reply.json()))

    if not frames:
        return pd.DataFrame()
    return pd.concat(frames)
3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/module-1/lab-data_cleaning/your-code/.ipynb_checkpoints/main-checkpoint.ipynb b/module-1/lab-data_cleaning/your-code/.ipynb_checkpoints/main-checkpoint.ipynb index 31724c58f..a71966474 100644 --- a/module-1/lab-data_cleaning/your-code/.ipynb_checkpoints/main-checkpoint.ipynb +++ b/module-1/lab-data_cleaning/your-code/.ipynb_checkpoints/main-checkpoint.ipynb @@ -9,10 +9,12 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 22, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "import pandas as pd" + ] }, { "cell_type": "markdown", @@ -23,10 +25,13 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 23, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "import pymysql\n", + "from sqlalchemy import create_engine" + ] }, { "cell_type": "markdown", @@ -37,10 +42,12 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 24, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "engine = create_engine('mysql+pymysql://guest:relational@relational.fit.cvut.cz/stats')" + ] }, { "cell_type": "markdown", @@ -51,10 +58,12 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 25, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "data_user = pd.read_sql_query('SELECT * FROM stats.users', engine)" + ] }, { "cell_type": "markdown", @@ -65,10 +74,12 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 26, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "data_user.rename(columns = {'Id':'userId'}, inplace= True)\n" + ] }, { "cell_type": "markdown", 
@@ -79,10 +90,12 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 27, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "data_posts = pd.read_sql_query('SELECT * FROM stats.posts', engine)" + ] }, { "cell_type": "markdown", @@ -93,10 +106,12 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 28, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "data_posts.rename(columns = {'Id':'postId','OwnerUserId':'userId'}, inplace = True)" + ] }, { "cell_type": "markdown", @@ -109,10 +124,15 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 29, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "data_user2 = data_user[['userId', 'Reputation','Views','UpVotes','DownVotes']]\n", + "\n", + "data_posts2 = data_posts[['postId', 'Score','userId','ViewCount','CommentCount']]\n", + "\n" + ] }, { "cell_type": "markdown", @@ -124,10 +144,12 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 58, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "data_merge = pd.merge(data_user2, data_posts2)\n" + ] }, { "cell_type": "markdown", @@ -138,10 +160,32 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 59, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "text/plain": [ + "userId 0\n", + "Reputation 0\n", + "Views 0\n", + "UpVotes 0\n", + "DownVotes 0\n", + "postId 0\n", + "Score 0\n", + "ViewCount 48396\n", + "CommentCount 0\n", + "dtype: int64" + ] + }, + "execution_count": 59, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data_merge.isnull().sum()" + ] }, { "cell_type": "markdown", @@ -153,10 +197,13 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 60, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "#i will fill them by taking the average of column 'ViewCount' and using that value\n", + 
"data_merge['ViewCount']=data_merge['ViewCount'].fillna(data_merge['ViewCount'].mean())" + ] }, { "cell_type": "markdown", @@ -167,10 +214,36 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 61, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "text/plain": [ + "userId int64\n", + "Reputation int64\n", + "Views int64\n", + "UpVotes int64\n", + "DownVotes int64\n", + "postId int64\n", + "Score int64\n", + "ViewCount int64\n", + "CommentCount int64\n", + "dtype: object" + ] + }, + "execution_count": 61, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "convert_dict = {'ViewCount': int, \n", + " } \n", + " \n", + "data_merge = data_merge.astype(convert_dict) \n", + "data_merge.dtypes\n" + ] }, { "cell_type": "markdown", @@ -178,6 +251,80 @@ "source": [ "#### Bonus: Identify extreme values in your merged dataframe as you have learned in class, create a dataframe called outliers with the same columns as our data set and calculate the bounds. The values of the outliers dataframe will be the values of the merged_df that fall outside that bounds. You will need to save your outliers dataframe to a csv file on your-code folder." 
] + }, + { + "cell_type": "code", + "execution_count": 240, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " userId Reputation Views UpVotes DownVotes \\\n", + "count 90584.000000 90584.000000 90584.000000 90584.000000 90584.000000 \n", + "mean 16546.764727 6282.395412 1034.245176 734.315718 33.273249 \n", + "std 15273.367108 15102.268670 2880.074012 2050.869327 134.936435 \n", + "min -1.000000 1.000000 0.000000 0.000000 0.000000 \n", + "25% 3437.000000 60.000000 5.000000 1.000000 0.000000 \n", + "50% 11032.000000 396.000000 45.000000 22.000000 0.000000 \n", + "75% 27700.000000 4460.000000 514.250000 283.000000 8.000000 \n", + "max 55746.000000 87393.000000 20932.000000 11442.000000 1920.000000 \n", + "\n", + " postId Score ViewCount CommentCount \n", + "count 90584.000000 90584.000000 90584.000000 90584.000000 \n", + "mean 56539.080522 2.780767 556.305595 1.894650 \n", + "std 33840.307529 4.948922 1608.469419 2.638704 \n", + "min 1.000000 -19.000000 1.000000 0.000000 \n", + "25% 26051.750000 1.000000 144.000000 0.000000 \n", + "50% 57225.500000 2.000000 556.000000 1.000000 \n", + "75% 86145.250000 3.000000 556.000000 3.000000 \n", + "max 115378.000000 192.000000 175495.000000 45.000000 \n" + ] + } + ], + "source": [ + "outliers = data_merge.describe()\n", + "print(outliers)" + ] + }, + { + "cell_type": "code", + "execution_count": 249, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "outliers are for ['userId'] : -32957.5 -8694.5\n", + "outliers are for ['Reputation'] : -6540.0 -2140.0\n", + "outliers are for ['Views'] : -758.875 -249.625\n", + "outliers are for ['UpVotes'] : -422.0 -140.0\n", + "outliers are for ['DownVotes'] : -12.0 -4.0\n", + "outliers are for ['postId'] : -64088.5 -3995.0\n", + "outliers are for ['Score'] : -2.0 0.0\n", + "outliers are for ['ViewCount'] : -474.0 -62.0\n", + "outliers are for ['CommentCount'] : -4.5 -1.5\n" + ] + } + ], + 
"source": [ + "columns = list(outliers)\n", + "for i in columns:\n", + " q3 = outliers[i][6]\n", + " q1 = outliers[i][4]\n", + " iqr = q3-q1\n", + " print(\"outliers are for\",[i],\":\",q1 -(1.5 * iqr),q3 -(1.5 * iqr))\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { @@ -196,7 +343,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.5" + "version": "3.7.4" } }, "nbformat": 4, diff --git a/module-1/lab-data_cleaning/your-code/main.ipynb b/module-1/lab-data_cleaning/your-code/main.ipynb index 31724c58f..cfdfe5a33 100644 --- a/module-1/lab-data_cleaning/your-code/main.ipynb +++ b/module-1/lab-data_cleaning/your-code/main.ipynb @@ -9,10 +9,12 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 22, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "import pandas as pd" + ] }, { "cell_type": "markdown", @@ -23,10 +25,13 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 23, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "import pymysql\n", + "from sqlalchemy import create_engine" + ] }, { "cell_type": "markdown", @@ -37,10 +42,12 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 24, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "engine = create_engine('mysql+pymysql://guest:relational@relational.fit.cvut.cz/stats')" + ] }, { "cell_type": "markdown", @@ -51,10 +58,12 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 25, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "data_user = pd.read_sql_query('SELECT * FROM stats.users', engine)" + ] }, { "cell_type": "markdown", @@ -65,10 +74,12 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 26, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "data_user.rename(columns = {'Id':'userId'}, 
inplace= True)\n" + ] }, { "cell_type": "markdown", @@ -79,10 +90,12 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 27, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "data_posts = pd.read_sql_query('SELECT * FROM stats.posts', engine)" + ] }, { "cell_type": "markdown", @@ -93,10 +106,12 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 28, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "data_posts.rename(columns = {'Id':'postId','OwnerUserId':'userId'}, inplace = True)" + ] }, { "cell_type": "markdown", @@ -109,10 +124,15 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 29, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "data_user2 = data_user[['userId', 'Reputation','Views','UpVotes','DownVotes']]\n", + "\n", + "data_posts2 = data_posts[['postId', 'Score','userId','ViewCount','CommentCount']]\n", + "\n" + ] }, { "cell_type": "markdown", @@ -124,10 +144,12 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 58, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "data_merge = pd.merge(data_user2, data_posts2)\n" + ] }, { "cell_type": "markdown", @@ -138,10 +160,32 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 59, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "text/plain": [ + "userId 0\n", + "Reputation 0\n", + "Views 0\n", + "UpVotes 0\n", + "DownVotes 0\n", + "postId 0\n", + "Score 0\n", + "ViewCount 48396\n", + "CommentCount 0\n", + "dtype: int64" + ] + }, + "execution_count": 59, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data_merge.isnull().sum()" + ] }, { "cell_type": "markdown", @@ -153,10 +197,13 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 60, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "#i will fill them by taking the average of column 'ViewCount' and 
using that value\n", + "data_merge['ViewCount']=data_merge['ViewCount'].fillna(data_merge['ViewCount'].mean())" + ] }, { "cell_type": "markdown", @@ -167,10 +214,36 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 61, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "text/plain": [ + "userId int64\n", + "Reputation int64\n", + "Views int64\n", + "UpVotes int64\n", + "DownVotes int64\n", + "postId int64\n", + "Score int64\n", + "ViewCount int64\n", + "CommentCount int64\n", + "dtype: object" + ] + }, + "execution_count": 61, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "convert_dict = {'ViewCount': int, \n", + " } \n", + " \n", + "data_merge = data_merge.astype(convert_dict) \n", + "data_merge.dtypes\n" + ] }, { "cell_type": "markdown", @@ -178,6 +251,77 @@ "source": [ "#### Bonus: Identify extreme values in your merged dataframe as you have learned in class, create a dataframe called outliers with the same columns as our data set and calculate the bounds. The values of the outliers dataframe will be the values of the merged_df that fall outside that bounds. You will need to save your outliers dataframe to a csv file on your-code folder." 
] + }, + { + "cell_type": "code", + "execution_count": 240, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " userId Reputation Views UpVotes DownVotes \\\n", + "count 90584.000000 90584.000000 90584.000000 90584.000000 90584.000000 \n", + "mean 16546.764727 6282.395412 1034.245176 734.315718 33.273249 \n", + "std 15273.367108 15102.268670 2880.074012 2050.869327 134.936435 \n", + "min -1.000000 1.000000 0.000000 0.000000 0.000000 \n", + "25% 3437.000000 60.000000 5.000000 1.000000 0.000000 \n", + "50% 11032.000000 396.000000 45.000000 22.000000 0.000000 \n", + "75% 27700.000000 4460.000000 514.250000 283.000000 8.000000 \n", + "max 55746.000000 87393.000000 20932.000000 11442.000000 1920.000000 \n", + "\n", + " postId Score ViewCount CommentCount \n", + "count 90584.000000 90584.000000 90584.000000 90584.000000 \n", + "mean 56539.080522 2.780767 556.305595 1.894650 \n", + "std 33840.307529 4.948922 1608.469419 2.638704 \n", + "min 1.000000 -19.000000 1.000000 0.000000 \n", + "25% 26051.750000 1.000000 144.000000 0.000000 \n", + "50% 57225.500000 2.000000 556.000000 1.000000 \n", + "75% 86145.250000 3.000000 556.000000 3.000000 \n", + "max 115378.000000 192.000000 175495.000000 45.000000 \n" + ] + } + ], + "source": [ + "outliers = data_merge.describe()\n", + "print(outliers)" + ] + }, + { + "cell_type": "code", + "execution_count": 272, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[-32957.5, -6540.0, -758.875, -422.0, -12.0, -64088.5, -2.0, -474.0, -4.5] [-8694.5, -2140.0, -249.625, -140.0, -4.0, -3995.0, 0.0, -62.0, -1.5]\n" + ] + } + ], + "source": [ + "columns = list(outliers)\n", + "bounds_low =[]\n", + "bounds_upper=[]\n", + "for i in columns:\n", + " q3 = outliers[i][6]\n", + " q1 = outliers[i][4]\n", + " iqr = q3-q1\n", + " low= q1 -(1.5 * iqr)\n", + " upper= q3 -(1.5 * iqr)\n", + " bounds_low.append(low)\n", + " bounds_upper.append(upper)\n", + 
"print(bounds_low,bounds_upper)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { @@ -196,7 +340,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.5" + "version": "3.7.4" } }, "nbformat": 4, diff --git a/module-1/lab-data_cleaning/your-code/weather.ipynb b/module-1/lab-data_cleaning/your-code/weather.ipynb index 4fc40abb3..1e7b0d222 100644 --- a/module-1/lab-data_cleaning/your-code/weather.ipynb +++ b/module-1/lab-data_cleaning/your-code/weather.ipynb @@ -61,7 +61,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.5" + "version": "3.7.4" } }, "nbformat": 4, diff --git a/module-1/lab-functional-programming/your-code/Q1.ipynb b/module-1/lab-functional-programming/your-code/Q1.ipynb index 8b07d3db6..557d8e79f 100644 --- a/module-1/lab-functional-programming/your-code/Q1.ipynb +++ b/module-1/lab-functional-programming/your-code/Q1.ipynb @@ -19,24 +19,32 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 23, "metadata": {}, "outputs": [], "source": [ "# Import required libraries\n", - "\n", + "import os\n", "# Define function\n", "def get_bow_from_docs(docs, stop_words=[]):\n", - " \n", + "\n", " # In the function, first define the variables you will use such as `corpus`, `bag_of_words`, and `term_freq`.\n", - " \n", - " \n", - " \n", + " #docs = ['doc1.txt', 'doc2.txt', 'doc3.txt']\n", + " bag_of_words = ['a', 'am', 'at', 'cool', 'i', 'ironhack', 'is', 'love', 'student']\n", + " term_freq = [\n", + " [0, 0, 0, 1, 0, 1, 1, 0, 0],\n", + " [0, 0, 0, 0, 1, 1, 0, 1, 0],\n", + " [1, 1, 1, 0, 1, 1, 0, 0, 1],\n", + "] \n", " \"\"\"\n", " Loop `docs` and read the content of each doc into a string in `corpus`.\n", " Remember to convert the doc content to lowercases and remove punctuation.\n", " \"\"\"\n", - "\n", + " corpus = []\n", + " for file in docs:\n", + " 
lines=open(file).readline()\n", + " corpus.append(lines)\n", + " corpus=[i.replace('.','').lower() for i in corpus]\n", " \n", " \n", " \"\"\"\n", @@ -45,23 +53,26 @@ " In addition, check if each term is in the `stop_words` array. Only append the term to `bag_of_words`\n", " if it is not a stop word.\n", " \"\"\"\n", - "\n", - " \n", - " \n", + " bag_of_words = []\n", + " for str in corpus:\n", + " split_str=str.split()\n", + " for word in split_str:\n", + " if word not in bag_of_words:\n", + " if word not in stop_words:\n", + " bag_of_words.append(word)\n", + " \n", + " \n", " \n", " \"\"\"\n", " Loop `corpus` again. For each doc string, count the number of occurrences of each term in `bag_of_words`. \n", " Create an array for each doc's term frequency and append it to `term_freq`.\n", " \"\"\"\n", + " term_freq = []\n", "\n", - " \n", - " \n", - " # Now return your output as an object\n", - " return {\n", - " \"bag_of_words\": bag_of_words,\n", - " \"term_freq\": term_freq\n", - " }\n", - " " + " for sentence in corpus:\n", + " term_freq.append([(sentence.split()).count(term) for term in bag_of_words])\n", + "\n", + " return {\"bag of words\":bag_of_words,\"term_freq\":term_freq}\n" ] }, { @@ -75,13 +86,23 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 24, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'bag of words': ['ironhack', 'is', 'cool', 'i', 'love', 'am', 'a', 'student', 'at'], 'term_freq': [[1, 1, 1, 0, 0, 0, 0, 0, 0], [1, 0, 0, 1, 1, 0, 0, 0, 0], [1, 0, 0, 1, 0, 1, 1, 1, 1]]}\n" + ] + } + ], "source": [ - "# Define doc paths array\n", - "docs = []\n", - "\n", + "# Define doc paths list\n", + "path = '../../lab-string-operations/your-code/'\n", + "files = ['doc1.txt', 'doc2.txt', 'doc3.txt']\n", + "docs = [path + f for f in files]\n", + "stop_words\n", "# Obtain BoW from your function\n", "bow = get_bow_from_docs(docs)\n", "\n", @@ -100,12 +121,21 @@ }, { 
"cell_type": "code", - "execution_count": null, + "execution_count": 25, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "frozenset({'fill', 'find', 'beside', 'became', 'besides', 'noone', 'up', 'less', 'we', 'nowhere', 'too', 'under', 'within', 'hereby', 'whereupon', 'detail', 'above', 'i', 'than', 'yourselves', 'whereafter', 'of', 'ourselves', 'thin', 'it', 'un', 'own', 'whose', 'anyway', 'amount', 'becomes', 'most', 'thick', 'me', 'side', 'only', 'bill', 'thence', 'keep', 'one', 'per', 'many', 'both', 'anyhow', 'against', 'nothing', 'this', 'wherever', 'each', 'some', 'something', 'please', 'off', 'do', 'rather', 'by', 'which', 'from', 'ltd', 'beforehand', 'along', 'everywhere', 'must', 'and', 'eg', 'their', 'without', 'even', 'but', 'if', 'my', 'whence', 'being', 'once', 'seem', 'part', 'therefore', 'either', 'indeed', 'may', 'cant', 'anywhere', 'already', 'found', 'see', 'seemed', 'ie', 'same', 'everyone', 'the', 'still', 'while', 'herself', 'three', 'was', 'over', 'during', 'much', 'what', 'these', 'for', 'has', 'not', 'onto', 'moreover', 'sometimes', 'also', 'are', 'have', 'how', 'never', 'or', 'bottom', 'via', 'themselves', 'when', 'might', 'amongst', 'she', 'ever', 'always', 'upon', 'whole', 'yourself', 'therein', 'hasnt', 'at', 'his', 'towards', 'itself', 'seeming', 'inc', 'anything', 'where', 'since', 'between', 'other', 'together', 'namely', 'twenty', 'ours', 'whereas', 'done', 'you', 'none', 'your', 'around', 'nor', 'among', 'nevertheless', 'serious', 'an', 'us', 'until', 'whither', 'name', 'whenever', 'hence', 'behind', 'call', 'take', 'more', 'her', 'former', 'whatever', 'before', 'any', 'move', 'fifty', 'others', 'throughout', 'been', 'due', 'he', 'a', 'sixty', 'below', 'out', 'latterly', 'those', 'further', 'almost', 'again', 'someone', 'thereupon', 'on', 'full', 'get', 'cannot', 'system', 'de', 'eleven', 'who', 'otherwise', 'hereafter', 'become', 'next', 'about', 'whereby', 'forty', 
'hers', 'that', 'anyone', 'somewhere', 'though', 'every', 'after', 'should', 'becoming', 'elsewhere', 'except', 'enough', 'whoever', 'empty', 'him', 'top', 'made', 'put', 'now', 'eight', 'describe', 'well', 'six', 'meanwhile', 'go', 'had', 'four', 'latter', 'perhaps', 'there', 'wherein', 'can', 'seems', 'couldnt', 'nine', 'here', 'third', 'few', 'another', 'con', 'mostly', 'thereafter', 'sometime', 'whether', 'first', 'myself', 'am', 'through', 'would', 'as', 'else', 'will', 'such', 'whom', 'thereby', 'front', 'all', 'to', 'were', 'down', 'nobody', 'fire', 're', 'because', 'give', 'show', 'with', 'interest', 'be', 'is', 'why', 'several', 'very', 'mill', 'fifteen', 'least', 'formerly', 'last', 'thru', 'no', 'two', 'beyond', 'somehow', 'yet', 'its', 'cry', 'often', 'them', 'afterwards', 'although', 'then', 'they', 'sincere', 'our', 'so', 'everything', 'in', 'back', 'thus', 'however', 'twelve', 'ten', 'yours', 'could', 'etc', 'hundred', 'into', 'across', 'mine', 'amoungst', 'hereupon', 'herein', 'co', 'toward', 'alone', 'himself', 'neither', 'five'})\n" + ] + } + ], "source": [ "from sklearn.feature_extraction import stop_words\n", - "print(stop_words.ENGLISH_STOP_WORDS)" + "stop_words= stop_words.ENGLISH_STOP_WORDS\n", + "print(stop_words)" ] }, { @@ -128,13 +158,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 26, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'bag of words': ['ironhack', 'cool', 'love', 'student'], 'term_freq': [[1, 1, 0, 0], [1, 0, 1, 0], [1, 0, 0, 1]]}\n" + ] + } + ], "source": [ - "bow = get_bow_from_docs(bow, stop_words.ENGLISH_STOP_WORDS)\n", - "\n", - "print(bow)" + "bow2 = get_bow_from_docs(docs, stop_words)\n", + "print(bow2)" ] }, { @@ -146,6 +183,13 @@ "```{'bag_of_words': ['ironhack', 'cool', 'love', 'student'], 'term_freq': [[1, 1, 0, 0], [1, 0, 1, 0], [1, 0, 0, 1]]}```" ] }, + { + "cell_type": "code", + "execution_count": null, + 
"metadata": {}, + "outputs": [], + "source": [] + }, { "cell_type": "code", "execution_count": null, @@ -170,7 +214,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.6" + "version": "3.7.4" } }, "nbformat": 4, diff --git a/module-1/lab-functional-programming/your-code/Q2.ipynb b/module-1/lab-functional-programming/your-code/Q2.ipynb index f50f442f7..96e1f9675 100644 --- a/module-1/lab-functional-programming/your-code/Q2.ipynb +++ b/module-1/lab-functional-programming/your-code/Q2.ipynb @@ -15,12 +15,60 @@ }, { "cell_type": "code", - "execution_count": 60, + "execution_count": 27, "metadata": {}, "outputs": [], "source": [ + "import re\n", + "import os\n", "# Define your string handling functions below\n", - "# Minimal 3 functions\n" + "# Minimal 3 functions\n", + "#corpus=\"
  • UX/UI...Design Bootcamp? (Full-Time)