|
3 | 3 | {
|
4 | 4 | "cell_type": "markdown",
|
5 | 5 | "metadata": {
|
6 |
| - "colab_type": "text", |
7 | 6 | "id": "sVtvH58nb_Hp"
|
8 | 7 | },
|
9 | 8 | "source": [
|
|
28 | 27 | "cell_type": "code",
|
29 | 28 | "execution_count": 1,
|
30 | 29 | "metadata": {
|
31 |
| - "colab": {}, |
32 |
| - "colab_type": "code", |
33 |
| - "id": "JQX8DAmBb_Hr" |
| 30 | + "colab": { |
| 31 | + "base_uri": "https://localhost:8080/" |
| 32 | + }, |
| 33 | + "id": "JQX8DAmBb_Hr", |
| 34 | + "outputId": "3e55c7d1-be7c-44bf-caf8-d00cf9dab00b" |
34 | 35 | },
|
35 |
| - "outputs": [], |
| 36 | + "outputs": [ |
| 37 | + { |
| 38 | + "name": "stdout", |
| 39 | + "output_type": "stream", |
| 40 | + "text": [ |
| 41 | + "[nltk_data] Downloading package stopwords to /root/nltk_data...\n", |
| 42 | + "[nltk_data] Unzipping corpora/stopwords.zip.\n", |
| 43 | + "[nltk_data] Downloading package punkt to /root/nltk_data...\n", |
| 44 | + "[nltk_data] Unzipping tokenizers/punkt.zip.\n" |
| 45 | + ] |
| 46 | + } |
| 47 | + ], |
36 | 48 | "source": [
|
37 | 49 | "#basic imports\n",
|
38 | 50 | "import os\n",
|
| 51 | + "import wget\n", |
| 52 | + "import gzip\n", |
| 53 | + "import shutil\n", |
39 | 54 | "from time import time\n",
|
40 | 55 | "\n",
|
41 | 56 | "#pre-processing imports\n",
|
| 57 | + "import nltk\n", |
| 58 | + "nltk.download('stopwords')\n", |
| 59 | + "nltk.download('punkt')\n", |
42 | 60 | "from nltk.tokenize import word_tokenize\n",
|
43 | 61 | "from nltk.corpus import stopwords\n",
|
44 | 62 | "from string import punctuation\n",
|
|
54 | 72 | {
|
55 | 73 | "cell_type": "code",
|
56 | 74 | "execution_count": 2,
|
57 |
| - "metadata": {}, |
58 |
| - "outputs": [], |
59 |
| - "source": [ |
60 |
| - "path = os.getcwd()\n", |
61 |
| - "path = path + '\\Data'\n", |
62 |
| - "\n", |
63 |
| - "fil = 'sentiment_sentences.txt'" |
64 |
| - ] |
65 |
| - }, |
66 |
| - { |
67 |
| - "cell_type": "code", |
68 |
| - "execution_count": 3, |
69 |
| - "metadata": {}, |
| 75 | + "metadata": { |
| 76 | + "colab": { |
| 77 | + "base_uri": "https://localhost:8080/", |
| 78 | + "height": 140, |
| 79 | + "resources": { |
| 80 | + "http://localhost:8080/nbextensions/google.colab/files.js": { |
| 81 | + "data": "Ly8gQ29weXJpZ2h0IDIwMTcgR29vZ2xlIExMQwovLwovLyBMaWNlbnNlZCB1bmRlciB0aGUgQXBhY2hlIExpY2Vuc2UsIFZlcnNpb24gMi4wICh0aGUgIkxpY2Vuc2UiKTsKLy8geW91IG1heSBub3QgdXNlIHRoaXMgZmlsZSBleGNlcHQgaW4gY29tcGxpYW5jZSB3aXRoIHRoZSBMaWNlbnNlLgovLyBZb3UgbWF5IG9idGFpbiBhIGNvcHkgb2YgdGhlIExpY2Vuc2UgYXQKLy8KLy8gICAgICBodHRwOi8vd3d3LmFwYWNoZS5vcmcvbGljZW5zZXMvTElDRU5TRS0yLjAKLy8KLy8gVW5sZXNzIHJlcXVpcmVkIGJ5IGFwcGxpY2FibGUgbGF3IG9yIGFncmVlZCB0byBpbiB3cml0aW5nLCBzb2Z0d2FyZQovLyBkaXN0cmlidXRlZCB1bmRlciB0aGUgTGljZW5zZSBpcyBkaXN0cmlidXRlZCBvbiBhbiAiQVMgSVMiIEJBU0lTLAovLyBXSVRIT1VUIFdBUlJBTlRJRVMgT1IgQ09ORElUSU9OUyBPRiBBTlkgS0lORCwgZWl0aGVyIGV4cHJlc3Mgb3IgaW1wbGllZC4KLy8gU2VlIHRoZSBMaWNlbnNlIGZvciB0aGUgc3BlY2lmaWMgbGFuZ3VhZ2UgZ292ZXJuaW5nIHBlcm1pc3Npb25zIGFuZAovLyBsaW1pdGF0aW9ucyB1bmRlciB0aGUgTGljZW5zZS4KCi8qKgogKiBAZmlsZW92ZXJ2aWV3IEhlbHBlcnMgZm9yIGdvb2dsZS5jb2xhYiBQeXRob24gbW9kdWxlLgogKi8KKGZ1bmN0aW9uKHNjb3BlKSB7CmZ1bmN0aW9uIHNwYW4odGV4dCwgc3R5bGVBdHRyaWJ1dGVzID0ge30pIHsKICBjb25zdCBlbGVtZW50ID0gZG9jdW1lbnQuY3JlYXRlRWxlbWVudCgnc3BhbicpOwogIGVsZW1lbnQudGV4dENvbnRlbnQgPSB0ZXh0OwogIGZvciAoY29uc3Qga2V5IG9mIE9iamVjdC5rZXlzKHN0eWxlQXR0cmlidXRlcykpIHsKICAgIGVsZW1lbnQuc3R5bGVba2V5XSA9IHN0eWxlQXR0cmlidXRlc1trZXldOwogIH0KICByZXR1cm4gZWxlbWVudDsKfQoKLy8gTWF4IG51bWJlciBvZiBieXRlcyB3aGljaCB3aWxsIGJlIHVwbG9hZGVkIGF0IGEgdGltZS4KY29uc3QgTUFYX1BBWUxPQURfU0laRSA9IDEwMCAqIDEwMjQ7CgpmdW5jdGlvbiBfdXBsb2FkRmlsZXMoaW5wdXRJZCwgb3V0cHV0SWQpIHsKICBjb25zdCBzdGVwcyA9IHVwbG9hZEZpbGVzU3RlcChpbnB1dElkLCBvdXRwdXRJZCk7CiAgY29uc3Qgb3V0cHV0RWxlbWVudCA9IGRvY3VtZW50LmdldEVsZW1lbnRCeUlkKG91dHB1dElkKTsKICAvLyBDYWNoZSBzdGVwcyBvbiB0aGUgb3V0cHV0RWxlbWVudCB0byBtYWtlIGl0IGF2YWlsYWJsZSBmb3IgdGhlIG5leHQgY2FsbAogIC8vIHRvIHVwbG9hZEZpbGVzQ29udGludWUgZnJvbSBQeXRob24uCiAgb3V0cHV0RWxlbWVudC5zdGVwcyA9IHN0ZXBzOwoKICByZXR1cm4gX3VwbG9hZEZpbGVzQ29udGludWUob3V0cHV0SWQpOwp9CgovLyBUaGlzIGlzIHJvdWdobHkgYW4gYXN5bmMgZ2VuZXJhdG9yIChub3Qgc3VwcG9ydGVkIGluIHRoZSBicm93c2VyIHlldCksCi8vIHdoZXJlIHRoZXJlIGFyZSBtdWx0aXBsZSBhc3luY2hyb25vdXMgc3RlcHMgYW
5kIHRoZSBQeXRob24gc2lkZSBpcyBnb2luZwovLyB0byBwb2xsIGZvciBjb21wbGV0aW9uIG9mIGVhY2ggc3RlcC4KLy8gVGhpcyB1c2VzIGEgUHJvbWlzZSB0byBibG9jayB0aGUgcHl0aG9uIHNpZGUgb24gY29tcGxldGlvbiBvZiBlYWNoIHN0ZXAsCi8vIHRoZW4gcGFzc2VzIHRoZSByZXN1bHQgb2YgdGhlIHByZXZpb3VzIHN0ZXAgYXMgdGhlIGlucHV0IHRvIHRoZSBuZXh0IHN0ZXAuCmZ1bmN0aW9uIF91cGxvYWRGaWxlc0NvbnRpbnVlKG91dHB1dElkKSB7CiAgY29uc3Qgb3V0cHV0RWxlbWVudCA9IGRvY3VtZW50LmdldEVsZW1lbnRCeUlkKG91dHB1dElkKTsKICBjb25zdCBzdGVwcyA9IG91dHB1dEVsZW1lbnQuc3RlcHM7CgogIGNvbnN0IG5leHQgPSBzdGVwcy5uZXh0KG91dHB1dEVsZW1lbnQubGFzdFByb21pc2VWYWx1ZSk7CiAgcmV0dXJuIFByb21pc2UucmVzb2x2ZShuZXh0LnZhbHVlLnByb21pc2UpLnRoZW4oKHZhbHVlKSA9PiB7CiAgICAvLyBDYWNoZSB0aGUgbGFzdCBwcm9taXNlIHZhbHVlIHRvIG1ha2UgaXQgYXZhaWxhYmxlIHRvIHRoZSBuZXh0CiAgICAvLyBzdGVwIG9mIHRoZSBnZW5lcmF0b3IuCiAgICBvdXRwdXRFbGVtZW50Lmxhc3RQcm9taXNlVmFsdWUgPSB2YWx1ZTsKICAgIHJldHVybiBuZXh0LnZhbHVlLnJlc3BvbnNlOwogIH0pOwp9CgovKioKICogR2VuZXJhdG9yIGZ1bmN0aW9uIHdoaWNoIGlzIGNhbGxlZCBiZXR3ZWVuIGVhY2ggYXN5bmMgc3RlcCBvZiB0aGUgdXBsb2FkCiAqIHByb2Nlc3MuCiAqIEBwYXJhbSB7c3RyaW5nfSBpbnB1dElkIEVsZW1lbnQgSUQgb2YgdGhlIGlucHV0IGZpbGUgcGlja2VyIGVsZW1lbnQuCiAqIEBwYXJhbSB7c3RyaW5nfSBvdXRwdXRJZCBFbGVtZW50IElEIG9mIHRoZSBvdXRwdXQgZGlzcGxheS4KICogQHJldHVybiB7IUl0ZXJhYmxlPCFPYmplY3Q+fSBJdGVyYWJsZSBvZiBuZXh0IHN0ZXBzLgogKi8KZnVuY3Rpb24qIHVwbG9hZEZpbGVzU3RlcChpbnB1dElkLCBvdXRwdXRJZCkgewogIGNvbnN0IGlucHV0RWxlbWVudCA9IGRvY3VtZW50LmdldEVsZW1lbnRCeUlkKGlucHV0SWQpOwogIGlucHV0RWxlbWVudC5kaXNhYmxlZCA9IGZhbHNlOwoKICBjb25zdCBvdXRwdXRFbGVtZW50ID0gZG9jdW1lbnQuZ2V0RWxlbWVudEJ5SWQob3V0cHV0SWQpOwogIG91dHB1dEVsZW1lbnQuaW5uZXJIVE1MID0gJyc7CgogIGNvbnN0IHBpY2tlZFByb21pc2UgPSBuZXcgUHJvbWlzZSgocmVzb2x2ZSkgPT4gewogICAgaW5wdXRFbGVtZW50LmFkZEV2ZW50TGlzdGVuZXIoJ2NoYW5nZScsIChlKSA9PiB7CiAgICAgIHJlc29sdmUoZS50YXJnZXQuZmlsZXMpOwogICAgfSk7CiAgfSk7CgogIGNvbnN0IGNhbmNlbCA9IGRvY3VtZW50LmNyZWF0ZUVsZW1lbnQoJ2J1dHRvbicpOwogIGlucHV0RWxlbWVudC5wYXJlbnRFbGVtZW50LmFwcGVuZENoaWxkKGNhbmNlbCk7CiAgY2FuY2VsLnRleHRDb250ZW50ID0gJ0NhbmNlbCB1cGxvYWQnOwogIGNvbnN0IGNhbmNlbFByb21pc2UgPSBuZX
cgUHJvbWlzZSgocmVzb2x2ZSkgPT4gewogICAgY2FuY2VsLm9uY2xpY2sgPSAoKSA9PiB7CiAgICAgIHJlc29sdmUobnVsbCk7CiAgICB9OwogIH0pOwoKICAvLyBXYWl0IGZvciB0aGUgdXNlciB0byBwaWNrIHRoZSBmaWxlcy4KICBjb25zdCBmaWxlcyA9IHlpZWxkIHsKICAgIHByb21pc2U6IFByb21pc2UucmFjZShbcGlja2VkUHJvbWlzZSwgY2FuY2VsUHJvbWlzZV0pLAogICAgcmVzcG9uc2U6IHsKICAgICAgYWN0aW9uOiAnc3RhcnRpbmcnLAogICAgfQogIH07CgogIGNhbmNlbC5yZW1vdmUoKTsKCiAgLy8gRGlzYWJsZSB0aGUgaW5wdXQgZWxlbWVudCBzaW5jZSBmdXJ0aGVyIHBpY2tzIGFyZSBub3QgYWxsb3dlZC4KICBpbnB1dEVsZW1lbnQuZGlzYWJsZWQgPSB0cnVlOwoKICBpZiAoIWZpbGVzKSB7CiAgICByZXR1cm4gewogICAgICByZXNwb25zZTogewogICAgICAgIGFjdGlvbjogJ2NvbXBsZXRlJywKICAgICAgfQogICAgfTsKICB9CgogIGZvciAoY29uc3QgZmlsZSBvZiBmaWxlcykgewogICAgY29uc3QgbGkgPSBkb2N1bWVudC5jcmVhdGVFbGVtZW50KCdsaScpOwogICAgbGkuYXBwZW5kKHNwYW4oZmlsZS5uYW1lLCB7Zm9udFdlaWdodDogJ2JvbGQnfSkpOwogICAgbGkuYXBwZW5kKHNwYW4oCiAgICAgICAgYCgke2ZpbGUudHlwZSB8fCAnbi9hJ30pIC0gJHtmaWxlLnNpemV9IGJ5dGVzLCBgICsKICAgICAgICBgbGFzdCBtb2RpZmllZDogJHsKICAgICAgICAgICAgZmlsZS5sYXN0TW9kaWZpZWREYXRlID8gZmlsZS5sYXN0TW9kaWZpZWREYXRlLnRvTG9jYWxlRGF0ZVN0cmluZygpIDoKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgJ24vYSd9IC0gYCkpOwogICAgY29uc3QgcGVyY2VudCA9IHNwYW4oJzAlIGRvbmUnKTsKICAgIGxpLmFwcGVuZENoaWxkKHBlcmNlbnQpOwoKICAgIG91dHB1dEVsZW1lbnQuYXBwZW5kQ2hpbGQobGkpOwoKICAgIGNvbnN0IGZpbGVEYXRhUHJvbWlzZSA9IG5ldyBQcm9taXNlKChyZXNvbHZlKSA9PiB7CiAgICAgIGNvbnN0IHJlYWRlciA9IG5ldyBGaWxlUmVhZGVyKCk7CiAgICAgIHJlYWRlci5vbmxvYWQgPSAoZSkgPT4gewogICAgICAgIHJlc29sdmUoZS50YXJnZXQucmVzdWx0KTsKICAgICAgfTsKICAgICAgcmVhZGVyLnJlYWRBc0FycmF5QnVmZmVyKGZpbGUpOwogICAgfSk7CiAgICAvLyBXYWl0IGZvciB0aGUgZGF0YSB0byBiZSByZWFkeS4KICAgIGxldCBmaWxlRGF0YSA9IHlpZWxkIHsKICAgICAgcHJvbWlzZTogZmlsZURhdGFQcm9taXNlLAogICAgICByZXNwb25zZTogewogICAgICAgIGFjdGlvbjogJ2NvbnRpbnVlJywKICAgICAgfQogICAgfTsKCiAgICAvLyBVc2UgYSBjaHVua2VkIHNlbmRpbmcgdG8gYXZvaWQgbWVzc2FnZSBzaXplIGxpbWl0cy4gU2VlIGIvNjIxMTU2NjAuCiAgICBsZXQgcG9zaXRpb24gPSAwOwogICAgZG8gewogICAgICBjb25zdCBsZW5ndGggPSBNYXRoLm1pbihmaWxlRGF0YS5ieXRlTGVuZ3RoIC0gcG9zaXRpb24sIE1BWF9QQV
lMT0FEX1NJWkUpOwogICAgICBjb25zdCBjaHVuayA9IG5ldyBVaW50OEFycmF5KGZpbGVEYXRhLCBwb3NpdGlvbiwgbGVuZ3RoKTsKICAgICAgcG9zaXRpb24gKz0gbGVuZ3RoOwoKICAgICAgY29uc3QgYmFzZTY0ID0gYnRvYShTdHJpbmcuZnJvbUNoYXJDb2RlLmFwcGx5KG51bGwsIGNodW5rKSk7CiAgICAgIHlpZWxkIHsKICAgICAgICByZXNwb25zZTogewogICAgICAgICAgYWN0aW9uOiAnYXBwZW5kJywKICAgICAgICAgIGZpbGU6IGZpbGUubmFtZSwKICAgICAgICAgIGRhdGE6IGJhc2U2NCwKICAgICAgICB9LAogICAgICB9OwoKICAgICAgbGV0IHBlcmNlbnREb25lID0gZmlsZURhdGEuYnl0ZUxlbmd0aCA9PT0gMCA/CiAgICAgICAgICAxMDAgOgogICAgICAgICAgTWF0aC5yb3VuZCgocG9zaXRpb24gLyBmaWxlRGF0YS5ieXRlTGVuZ3RoKSAqIDEwMCk7CiAgICAgIHBlcmNlbnQudGV4dENvbnRlbnQgPSBgJHtwZXJjZW50RG9uZX0lIGRvbmVgOwoKICAgIH0gd2hpbGUgKHBvc2l0aW9uIDwgZmlsZURhdGEuYnl0ZUxlbmd0aCk7CiAgfQoKICAvLyBBbGwgZG9uZS4KICB5aWVsZCB7CiAgICByZXNwb25zZTogewogICAgICBhY3Rpb246ICdjb21wbGV0ZScsCiAgICB9CiAgfTsKfQoKc2NvcGUuZ29vZ2xlID0gc2NvcGUuZ29vZ2xlIHx8IHt9OwpzY29wZS5nb29nbGUuY29sYWIgPSBzY29wZS5nb29nbGUuY29sYWIgfHwge307CnNjb3BlLmdvb2dsZS5jb2xhYi5fZmlsZXMgPSB7CiAgX3VwbG9hZEZpbGVzLAogIF91cGxvYWRGaWxlc0NvbnRpbnVlLAp9Owp9KShzZWxmKTsK", |
| 82 | + "headers": [ |
| 83 | + [ |
| 84 | + "content-type", |
| 85 | + "application/javascript" |
| 86 | + ] |
| 87 | + ], |
| 88 | + "ok": true, |
| 89 | + "status": 200, |
| 90 | + "status_text": "" |
| 91 | + } |
| 92 | + } |
| 93 | + }, |
| 94 | + "id": "S8RM8c6AS8AX", |
| 95 | + "outputId": "e6e3d6b7-bd49-4dd9-a28e-59a4193187aa" |
| 96 | + }, |
70 | 97 | "outputs": [
|
| 98 | + { |
| 99 | + "data": { |
| 100 | + "text/html": [ |
| 101 | + "\n", |
| 102 | + " <input type=\"file\" id=\"files-d0856459-a7c7-4318-a2a4-6c118d5723ab\" name=\"files[]\" multiple disabled\n", |
| 103 | + " style=\"border:none\" />\n", |
| 104 | + " <output id=\"result-d0856459-a7c7-4318-a2a4-6c118d5723ab\">\n", |
| 105 | + " Upload widget is only available when the cell has been executed in the\n", |
| 106 | + " current browser session. Please rerun this cell to enable.\n", |
| 107 | + " </output>\n", |
| 108 | + " <script src=\"/nbextensions/google.colab/files.js\"></script> " |
| 109 | + ], |
| 110 | + "text/plain": [ |
| 111 | + "<IPython.core.display.HTML object>" |
| 112 | + ] |
| 113 | + }, |
| 114 | + "metadata": { |
| 115 | + "tags": [] |
| 116 | + }, |
| 117 | + "output_type": "display_data" |
| 118 | + }, |
71 | 119 | {
|
72 | 120 | "name": "stdout",
|
73 | 121 | "output_type": "stream",
|
74 | 122 | "text": [
|
75 |
| - "File already exists\n" |
| 123 | + "Saving amazon_cells_labelled.txt to amazon_cells_labelled.txt\n", |
| 124 | + "Saving imdb_labelled.txt to imdb_labelled.txt\n", |
| 125 | + "Saving yelp_labelled.txt to yelp_labelled.txt\n" |
76 | 126 | ]
|
77 | 127 | }
|
78 | 128 | ],
|
79 | 129 | "source": [
|
80 |
| - "if not os.path.exists(path+\"\\sentiment_sentences.txt\"):\n", |
81 |
| - " file = open(os.path.join(path, fil), 'w')\n", |
82 |
| - " file.close()\n", |
| 130 | + "try:\n", |
| 131 | + " from google.colab import files\n", |
83 | 132 | " \n",
|
84 |
| - " # combined the three files to make sentiment_sentences.txt\n", |
85 |
| - " filenames = ['amazon_cells_labelled.txt', 'imdb_labelled.txt', 'yelp_labelled.txt']\n", |
| 133 | + " # upload 'amazon_cells_labelled.txt', 'imdb_labelled.txt' and 'yelp_labelled.txt'\n", |
| 134 | + " uploaded = files.upload()\n", |
| 135 | + " \n", |
| 136 | + "    !mkdir -p DATAPATH  # -p: do not fail when the cell is re-run and the directory already exists\n", |
| 137 | + " !mv -t DATAPATH amazon_cells_labelled.txt imdb_labelled.txt yelp_labelled.txt\n", |
| 138 | + " !cat DATAPATH/amazon_cells_labelled.txt DATAPATH/imdb_labelled.txt DATAPATH/yelp_labelled.txt > DATAPATH/sentiment_sentences.txt\n", |
| 139 | + " \n", |
| 140 | + "except ModuleNotFoundError:\n", |
| 141 | + "\n", |
| 142 | + " fil = 'sentiment_sentences.txt'\n", |
| 143 | + "\n", |
| 144 | + " if not os.path.exists(\"Data/sentiment_sentences.txt\"):\n", |
| 145 | + "        file = open(os.path.join('Data', fil), 'w')  # 'path' is no longer defined in this notebook; use the Data directory directly\n", |
| 146 | + " file.close()\n", |
| 147 | + " \n", |
| 148 | + " # combined the three files to make sentiment_sentences.txt\n", |
| 149 | + " filenames = ['amazon_cells_labelled.txt', 'imdb_labelled.txt', 'yelp_labelled.txt']\n", |
86 | 150 | "\n",
|
87 |
| - " with open(path+'\\sentiment_sentences.txt', 'w') as outfile:\n", |
88 |
| - " for fname in filenames:\n", |
89 |
| - " with open(path + '\\sentiment labelled sentences\\\\' + fname) as infile:\n", |
90 |
| - " outfile.write(infile.read())\n", |
91 |
| - " print(\"File created\")\n", |
92 |
| - "else:\n", |
93 |
| - " print(\"File already exists\")" |
| 151 | + " with open('Data/sentiment_sentences.txt', 'w') as outfile:\n", |
| 152 | + " for fname in filenames:\n", |
| 153 | + " with open('Data/sentiment labelled sentences/' + fname) as infile:\n", |
| 154 | + " outfile.write(infile.read())\n", |
| 155 | + " print(\"File created\")\n", |
| 156 | + " else:\n", |
| 157 | + " print(\"File already exists\")" |
94 | 158 | ]
|
95 | 159 | },
|
96 | 160 | {
|
97 | 161 | "cell_type": "code",
|
98 |
| - "execution_count": 4, |
| 162 | + "execution_count": 3, |
99 | 163 | "metadata": {
|
100 |
| - "colab": {}, |
101 |
| - "colab_type": "code", |
| 164 | + "colab": { |
| 165 | + "base_uri": "https://localhost:8080/" |
| 166 | + }, |
102 | 167 | "id": "COUGXAxcb_H5",
|
103 |
| - "outputId": "f1b6d8ad-e22b-4126-d2ea-862697c4158b", |
| 168 | + "outputId": "dd37e92a-942a-49c6-aa2e-8c48e542ec1b", |
104 | 169 | "scrolled": true
|
105 | 170 | },
|
106 | 171 | "outputs": [
|
107 | 172 | {
|
108 | 173 | "name": "stdout",
|
109 | 174 | "output_type": "stream",
|
110 | 175 | "text": [
|
111 |
| - "Wall time: 15.7 s\n", |
| 176 | + "--2021-07-04 11:24:51-- https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz\n", |
| 177 | + "Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.225.99\n", |
| 178 | + "Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.225.99|:443... connected.\n", |
| 179 | + "HTTP request sent, awaiting response... 200 OK\n", |
| 180 | + "Length: 1647046227 (1.5G) [application/x-gzip]\n", |
| 181 | + "Saving to: ‘DATAPATH/GoogleNews-vectors-negative300.bin.gz’\n", |
| 182 | + "\n", |
| 183 | + "GoogleNews-vectors- 100%[===================>] 1.53G 75.3MB/s in 26s \n", |
| 184 | + "\n", |
| 185 | + "2021-07-04 11:25:17 (60.2 MB/s) - ‘DATAPATH/GoogleNews-vectors-negative300.bin.gz’ saved [1647046227/1647046227]\n", |
| 186 | + "\n", |
| 187 | + "CPU times: user 3 µs, sys: 1e+03 ns, total: 4 µs\n", |
| 188 | + "Wall time: 9.06 µs\n", |
112 | 189 | "done loading Word2Vec\n"
|
113 | 190 | ]
|
114 | 191 | }
|
115 | 192 | ],
|
116 | 193 | "source": [
|
117 | 194 | "#Load the pre-trained word2vec model and the dataset\n",
|
118 | 195 | "try:\n",
|
| 196 | + " \n", |
119 | 197 | " from google.colab import files\n",
|
120 |
| - " data_path= \"DATAPATH\" \n", |
| 198 | + " data_path= \"DATAPATH\"\n", |
| 199 | + " !wget -P DATAPATH https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz\n", |
| 200 | + " !gunzip DATAPATH/GoogleNews-vectors-negative300.bin.gz \n", |
121 | 201 | " path_to_model = 'DATAPATH/GoogleNews-vectors-negative300.bin'\n",
|
122 | 202 | " training_data_path = \"DATAPATH/sentiment_sentences.txt\"\n",
|
| 203 | + " \n", |
123 | 204 | "except ModuleNotFoundError:\n",
|
124 |
| - " data_path= \"Data\" \n", |
125 | 205 | " \n",
|
126 |
| - " if not os.path.exists('../Ch2/GoogleNews-vectors-negative300.bin'):\n", |
127 |
| - " if not os.path.exists('../Ch3/GoogleNews-vectors-negative300.bin'):\n", |
128 |
| - " wget.download(\"https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz\")\n", |
129 |
| - " path_to_model = 'GoogleNews-vectors-negative300.bin'\n", |
| 206 | + " data_path= \"Data\"\n", |
| 207 | + " \n", |
| 208 | + " if not os.path.exists('GoogleNews-vectors-negative300.bin'):\n", |
| 209 | + " if not os.path.exists('../Ch2/GoogleNews-vectors-negative300.bin'):\n", |
| 210 | + " if not os.path.exists('../Ch3/GoogleNews-vectors-negative300.bin'):\n", |
| 211 | + " wget.download(\"https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz\")\n", |
| 212 | + "\n", |
| 213 | + " with gzip.open('GoogleNews-vectors-negative300.bin.gz', 'rb') as f_in:\n", |
| 214 | + " with open('GoogleNews-vectors-negative300.bin', 'wb') as f_out:\n", |
| 215 | + " shutil.copyfileobj(f_in, f_out)\n", |
| 216 | + "\n", |
| 217 | + " path_to_model = 'GoogleNews-vectors-negative300.bin'\n", |
| 218 | + " else:\n", |
| 219 | + " path_to_model = '../Ch3/GoogleNews-vectors-negative300.bin'\n", |
| 220 | + "\n", |
130 | 221 | " else:\n",
|
131 |
| - " path_to_model = '../Ch3/GoogleNews-vectors-negative300.bin'\n", |
132 |
| - " \n", |
| 222 | + " path_to_model = '../Ch2/GoogleNews-vectors-negative300.bin'\n", |
133 | 223 | " else:\n",
|
134 |
| - " path_to_model = '../Ch2/GoogleNews-vectors-negative300.bin'\n", |
| 224 | + " path_to_model = 'GoogleNews-vectors-negative300.bin'\n", |
135 | 225 | " \n",
|
136 | 226 | " training_data_path = os.path.join(data_path, \"sentiment_sentences.txt\")\n",
|
137 |
| - "\n", |
138 |
| - "\n", |
139 |
| - "\n", |
| 227 | + " \n", |
| 228 | + " \n", |
140 | 229 | "#Load W2V model. This will take some time. \n",
|
141 |
| - "%time w2v_model = KeyedVectors.load_word2vec_format(path_to_model, binary=True)\n", |
| 230 | + "# %time is a line magic: it only measures the statement written on the same line.\n", |
| 231 | + "%time w2v_model = KeyedVectors.load_word2vec_format(path_to_model, binary=True)\n", |
142 | 232 | "print('done loading Word2Vec')\n",
|
143 | 233 | "\n",
|
144 | 234 | "#Read text data, cats.\n",
|
|
154 | 244 | },
|
155 | 245 | {
|
156 | 246 | "cell_type": "code",
|
157 |
| - "execution_count": 5, |
| 247 | + "execution_count": 4, |
158 | 248 | "metadata": {
|
159 |
| - "colab": {}, |
160 |
| - "colab_type": "code", |
| 249 | + "colab": { |
| 250 | + "base_uri": "https://localhost:8080/" |
| 251 | + }, |
161 | 252 | "id": "m-WjFyC6b_IE",
|
162 |
| - "outputId": "5df9e11b-6f8e-42b8-e198-6fe343293cc3" |
| 253 | + "outputId": "ce75ae49-eaf7-4af9-fc70-28ad6fb984eb" |
163 | 254 | },
|
164 | 255 | "outputs": [
|
165 | 256 | {
|
|
179 | 270 | },
|
180 | 271 | {
|
181 | 272 | "cell_type": "code",
|
182 |
| - "execution_count": 6, |
| 273 | + "execution_count": 5, |
183 | 274 | "metadata": {
|
184 |
| - "colab": {}, |
185 |
| - "colab_type": "code", |
| 275 | + "colab": { |
| 276 | + "base_uri": "https://localhost:8080/" |
| 277 | + }, |
186 | 278 | "id": "XEz30Jztb_IP",
|
187 |
| - "outputId": "2169b2c9-e89f-439a-a23f-d322fb856841" |
| 279 | + "outputId": "321faadb-db0a-4d2b-9c4f-a504f29accf7" |
188 | 280 | },
|
189 | 281 | "outputs": [
|
190 | 282 | {
|
|
207 | 299 | },
|
208 | 300 | {
|
209 | 301 | "cell_type": "code",
|
210 |
| - "execution_count": 7, |
| 302 | + "execution_count": 6, |
211 | 303 | "metadata": {
|
212 |
| - "colab": {}, |
213 |
| - "colab_type": "code", |
| 304 | + "colab": { |
| 305 | + "base_uri": "https://localhost:8080/" |
| 306 | + }, |
214 | 307 | "id": "MFOGaDTwb_Ig",
|
215 |
| - "outputId": "7603e297-9167-43ec-c7da-46d82dc850ad" |
| 308 | + "outputId": "ccaf5749-fff8-440e-8709-026b1394afc1" |
216 | 309 | },
|
217 | 310 | "outputs": [
|
218 | 311 | {
|
|
245 | 338 | },
|
246 | 339 | {
|
247 | 340 | "cell_type": "code",
|
248 |
| - "execution_count": 8, |
| 341 | + "execution_count": 7, |
249 | 342 | "metadata": {
|
250 |
| - "colab": {}, |
251 |
| - "colab_type": "code", |
| 343 | + "colab": { |
| 344 | + "base_uri": "https://localhost:8080/" |
| 345 | + }, |
252 | 346 | "id": "fXRiGtY1b_Iq",
|
253 |
| - "outputId": "2d57a96f-8da8-4285-ca1e-2c617578b9e1" |
| 347 | + "outputId": "2edbe27b-0400-4df7-f1f0-8b0d20549892" |
254 | 348 | },
|
255 | 349 | "outputs": [
|
256 | 350 | {
|
|
287 | 381 | },
|
288 | 382 | {
|
289 | 383 | "cell_type": "code",
|
290 |
| - "execution_count": 9, |
| 384 | + "execution_count": 8, |
291 | 385 | "metadata": {
|
292 |
| - "colab": {}, |
293 |
| - "colab_type": "code", |
| 386 | + "colab": { |
| 387 | + "base_uri": "https://localhost:8080/" |
| 388 | + }, |
294 | 389 | "id": "mr9IaQppb_Ix",
|
295 |
| - "outputId": "13a84b5c-fde3-49f4-b156-5c2f36592b19" |
| 390 | + "outputId": "0d1c168d-daac-40c9-b725-98a6406bc309" |
296 | 391 | },
|
297 | 392 | "outputs": [
|
298 | 393 | {
|
299 | 394 | "name": "stdout",
|
300 | 395 | "output_type": "stream",
|
301 | 396 | "text": [
|
302 |
| - "Accuracy: 0.8173333333333334\n", |
| 397 | + "Accuracy: 0.816\n", |
303 | 398 | " precision recall f1-score support\n",
|
304 | 399 | "\n",
|
305 | 400 | " 0\n",
|
306 |
| - " 0.79 0.82 0.81 350\n", |
| 401 | + " 0.84 0.81 0.83 404\n", |
307 | 402 | " 1\n",
|
308 |
| - " 0.84 0.81 0.83 400\n", |
| 403 | + " 0.79 0.82 0.80 346\n", |
309 | 404 | "\n",
|
310 | 405 | " accuracy 0.82 750\n",
|
311 |
| - " macro avg 0.82 0.82 0.82 750\n", |
| 406 | + " macro avg 0.81 0.82 0.82 750\n", |
312 | 407 | "weighted avg 0.82 0.82 0.82 750\n",
|
313 | 408 | "\n"
|
314 | 409 | ]
|
|
327 | 422 | {
|
328 | 423 | "cell_type": "markdown",
|
329 | 424 | "metadata": {
|
330 |
| - "colab_type": "text", |
331 | 425 | "id": "k7wjLB8rb_JB"
|
332 | 426 | },
|
333 | 427 | "source": [
|
|
337 | 431 | ],
|
338 | 432 | "metadata": {
|
339 | 433 | "colab": {
|
340 |
| - "name": "Word2Vec_Example.ipynb", |
| 434 | + "collapsed_sections": [], |
| 435 | + "name": "03_Word2Vec_Example.ipynb", |
341 | 436 | "provenance": []
|
342 | 437 | },
|
343 | 438 | "kernelspec": {
|
|
355 | 450 | "name": "python",
|
356 | 451 | "nbconvert_exporter": "python",
|
357 | 452 | "pygments_lexer": "ipython3",
|
358 |
| - "version": "3.7.0" |
| 453 | + "version": "3.7.4" |
359 | 454 | }
|
360 | 455 | },
|
361 | 456 | "nbformat": 4,
|
|
0 commit comments