Skip to content

Commit e1dbd31

Browse files
Merge pull request #69 from jatinpapreja/master
Corrections in Chapter 3 and Chapter 4
2 parents 83bed9c + 302abcd commit e1dbd31

File tree

4 files changed

+1067
-749
lines changed

4 files changed

+1067
-749
lines changed

Ch3/09_Visualizing_Embeddings_Using_TSNE.ipynb

Lines changed: 88 additions & 45 deletions
Large diffs are not rendered by default.

Ch4/03_Word2Vec_Example.ipynb

Lines changed: 171 additions & 76 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@
33
{
44
"cell_type": "markdown",
55
"metadata": {
6-
"colab_type": "text",
76
"id": "sVtvH58nb_Hp"
87
},
98
"source": [
@@ -28,17 +27,36 @@
2827
"cell_type": "code",
2928
"execution_count": 1,
3029
"metadata": {
31-
"colab": {},
32-
"colab_type": "code",
33-
"id": "JQX8DAmBb_Hr"
30+
"colab": {
31+
"base_uri": "https://localhost:8080/"
32+
},
33+
"id": "JQX8DAmBb_Hr",
34+
"outputId": "3e55c7d1-be7c-44bf-caf8-d00cf9dab00b"
3435
},
35-
"outputs": [],
36+
"outputs": [
37+
{
38+
"name": "stdout",
39+
"output_type": "stream",
40+
"text": [
41+
"[nltk_data] Downloading package stopwords to /root/nltk_data...\n",
42+
"[nltk_data] Unzipping corpora/stopwords.zip.\n",
43+
"[nltk_data] Downloading package punkt to /root/nltk_data...\n",
44+
"[nltk_data] Unzipping tokenizers/punkt.zip.\n"
45+
]
46+
}
47+
],
3648
"source": [
3749
"#basic imports\n",
3850
"import os\n",
51+
"import wget\n",
52+
"import gzip\n",
53+
"import shutil\n",
3954
"from time import time\n",
4055
"\n",
4156
"#pre-processing imports\n",
57+
"import nltk\n",
58+
"nltk.download('stopwords')\n",
59+
"nltk.download('punkt')\n",
4260
"from nltk.tokenize import word_tokenize\n",
4361
"from nltk.corpus import stopwords\n",
4462
"from string import punctuation\n",
@@ -54,91 +72,163 @@
5472
{
5573
"cell_type": "code",
5674
"execution_count": 2,
57-
"metadata": {},
58-
"outputs": [],
59-
"source": [
60-
"path = os.getcwd()\n",
61-
"path = path + '\\Data'\n",
62-
"\n",
63-
"fil = 'sentiment_sentences.txt'"
64-
]
65-
},
66-
{
67-
"cell_type": "code",
68-
"execution_count": 3,
69-
"metadata": {},
75+
"metadata": {
76+
"colab": {
77+
"base_uri": "https://localhost:8080/",
78+
"height": 140,
79+
"resources": {
80+
"http://localhost:8080/nbextensions/google.colab/files.js": {
81+
"data": "Ly8gQ29weXJpZ2h0IDIwMTcgR29vZ2xlIExMQwovLwovLyBMaWNlbnNlZCB1bmRlciB0aGUgQXBhY2hlIExpY2Vuc2UsIFZlcnNpb24gMi4wICh0aGUgIkxpY2Vuc2UiKTsKLy8geW91IG1heSBub3QgdXNlIHRoaXMgZmlsZSBleGNlcHQgaW4gY29tcGxpYW5jZSB3aXRoIHRoZSBMaWNlbnNlLgovLyBZb3UgbWF5IG9idGFpbiBhIGNvcHkgb2YgdGhlIExpY2Vuc2UgYXQKLy8KLy8gICAgICBodHRwOi8vd3d3LmFwYWNoZS5vcmcvbGljZW5zZXMvTElDRU5TRS0yLjAKLy8KLy8gVW5sZXNzIHJlcXVpcmVkIGJ5IGFwcGxpY2FibGUgbGF3IG9yIGFncmVlZCB0byBpbiB3cml0aW5nLCBzb2Z0d2FyZQovLyBkaXN0cmlidXRlZCB1bmRlciB0aGUgTGljZW5zZSBpcyBkaXN0cmlidXRlZCBvbiBhbiAiQVMgSVMiIEJBU0lTLAovLyBXSVRIT1VUIFdBUlJBTlRJRVMgT1IgQ09ORElUSU9OUyBPRiBBTlkgS0lORCwgZWl0aGVyIGV4cHJlc3Mgb3IgaW1wbGllZC4KLy8gU2VlIHRoZSBMaWNlbnNlIGZvciB0aGUgc3BlY2lmaWMgbGFuZ3VhZ2UgZ292ZXJuaW5nIHBlcm1pc3Npb25zIGFuZAovLyBsaW1pdGF0aW9ucyB1bmRlciB0aGUgTGljZW5zZS4KCi8qKgogKiBAZmlsZW92ZXJ2aWV3IEhlbHBlcnMgZm9yIGdvb2dsZS5jb2xhYiBQeXRob24gbW9kdWxlLgogKi8KKGZ1bmN0aW9uKHNjb3BlKSB7CmZ1bmN0aW9uIHNwYW4odGV4dCwgc3R5bGVBdHRyaWJ1dGVzID0ge30pIHsKICBjb25zdCBlbGVtZW50ID0gZG9jdW1lbnQuY3JlYXRlRWxlbWVudCgnc3BhbicpOwogIGVsZW1lbnQudGV4dENvbnRlbnQgPSB0ZXh0OwogIGZvciAoY29uc3Qga2V5IG9mIE9iamVjdC5rZXlzKHN0eWxlQXR0cmlidXRlcykpIHsKICAgIGVsZW1lbnQuc3R5bGVba2V5XSA9IHN0eWxlQXR0cmlidXRlc1trZXldOwogIH0KICByZXR1cm4gZWxlbWVudDsKfQoKLy8gTWF4IG51bWJlciBvZiBieXRlcyB3aGljaCB3aWxsIGJlIHVwbG9hZGVkIGF0IGEgdGltZS4KY29uc3QgTUFYX1BBWUxPQURfU0laRSA9IDEwMCAqIDEwMjQ7CgpmdW5jdGlvbiBfdXBsb2FkRmlsZXMoaW5wdXRJZCwgb3V0cHV0SWQpIHsKICBjb25zdCBzdGVwcyA9IHVwbG9hZEZpbGVzU3RlcChpbnB1dElkLCBvdXRwdXRJZCk7CiAgY29uc3Qgb3V0cHV0RWxlbWVudCA9IGRvY3VtZW50LmdldEVsZW1lbnRCeUlkKG91dHB1dElkKTsKICAvLyBDYWNoZSBzdGVwcyBvbiB0aGUgb3V0cHV0RWxlbWVudCB0byBtYWtlIGl0IGF2YWlsYWJsZSBmb3IgdGhlIG5leHQgY2FsbAogIC8vIHRvIHVwbG9hZEZpbGVzQ29udGludWUgZnJvbSBQeXRob24uCiAgb3V0cHV0RWxlbWVudC5zdGVwcyA9IHN0ZXBzOwoKICByZXR1cm4gX3VwbG9hZEZpbGVzQ29udGludWUob3V0cHV0SWQpOwp9CgovLyBUaGlzIGlzIHJvdWdobHkgYW4gYXN5bmMgZ2VuZXJhdG9yIChub3Qgc3VwcG9ydGVkIGluIHRoZSBicm93c2VyIHlldCksCi8vIHdoZXJlIHRoZXJlIGFyZSBtdWx0aXBsZSBhc3luY2hyb25vdXMgc3RlcHMgYW5kIHRoZSB
QeXRob24gc2lkZSBpcyBnb2luZwovLyB0byBwb2xsIGZvciBjb21wbGV0aW9uIG9mIGVhY2ggc3RlcC4KLy8gVGhpcyB1c2VzIGEgUHJvbWlzZSB0byBibG9jayB0aGUgcHl0aG9uIHNpZGUgb24gY29tcGxldGlvbiBvZiBlYWNoIHN0ZXAsCi8vIHRoZW4gcGFzc2VzIHRoZSByZXN1bHQgb2YgdGhlIHByZXZpb3VzIHN0ZXAgYXMgdGhlIGlucHV0IHRvIHRoZSBuZXh0IHN0ZXAuCmZ1bmN0aW9uIF91cGxvYWRGaWxlc0NvbnRpbnVlKG91dHB1dElkKSB7CiAgY29uc3Qgb3V0cHV0RWxlbWVudCA9IGRvY3VtZW50LmdldEVsZW1lbnRCeUlkKG91dHB1dElkKTsKICBjb25zdCBzdGVwcyA9IG91dHB1dEVsZW1lbnQuc3RlcHM7CgogIGNvbnN0IG5leHQgPSBzdGVwcy5uZXh0KG91dHB1dEVsZW1lbnQubGFzdFByb21pc2VWYWx1ZSk7CiAgcmV0dXJuIFByb21pc2UucmVzb2x2ZShuZXh0LnZhbHVlLnByb21pc2UpLnRoZW4oKHZhbHVlKSA9PiB7CiAgICAvLyBDYWNoZSB0aGUgbGFzdCBwcm9taXNlIHZhbHVlIHRvIG1ha2UgaXQgYXZhaWxhYmxlIHRvIHRoZSBuZXh0CiAgICAvLyBzdGVwIG9mIHRoZSBnZW5lcmF0b3IuCiAgICBvdXRwdXRFbGVtZW50Lmxhc3RQcm9taXNlVmFsdWUgPSB2YWx1ZTsKICAgIHJldHVybiBuZXh0LnZhbHVlLnJlc3BvbnNlOwogIH0pOwp9CgovKioKICogR2VuZXJhdG9yIGZ1bmN0aW9uIHdoaWNoIGlzIGNhbGxlZCBiZXR3ZWVuIGVhY2ggYXN5bmMgc3RlcCBvZiB0aGUgdXBsb2FkCiAqIHByb2Nlc3MuCiAqIEBwYXJhbSB7c3RyaW5nfSBpbnB1dElkIEVsZW1lbnQgSUQgb2YgdGhlIGlucHV0IGZpbGUgcGlja2VyIGVsZW1lbnQuCiAqIEBwYXJhbSB7c3RyaW5nfSBvdXRwdXRJZCBFbGVtZW50IElEIG9mIHRoZSBvdXRwdXQgZGlzcGxheS4KICogQHJldHVybiB7IUl0ZXJhYmxlPCFPYmplY3Q+fSBJdGVyYWJsZSBvZiBuZXh0IHN0ZXBzLgogKi8KZnVuY3Rpb24qIHVwbG9hZEZpbGVzU3RlcChpbnB1dElkLCBvdXRwdXRJZCkgewogIGNvbnN0IGlucHV0RWxlbWVudCA9IGRvY3VtZW50LmdldEVsZW1lbnRCeUlkKGlucHV0SWQpOwogIGlucHV0RWxlbWVudC5kaXNhYmxlZCA9IGZhbHNlOwoKICBjb25zdCBvdXRwdXRFbGVtZW50ID0gZG9jdW1lbnQuZ2V0RWxlbWVudEJ5SWQob3V0cHV0SWQpOwogIG91dHB1dEVsZW1lbnQuaW5uZXJIVE1MID0gJyc7CgogIGNvbnN0IHBpY2tlZFByb21pc2UgPSBuZXcgUHJvbWlzZSgocmVzb2x2ZSkgPT4gewogICAgaW5wdXRFbGVtZW50LmFkZEV2ZW50TGlzdGVuZXIoJ2NoYW5nZScsIChlKSA9PiB7CiAgICAgIHJlc29sdmUoZS50YXJnZXQuZmlsZXMpOwogICAgfSk7CiAgfSk7CgogIGNvbnN0IGNhbmNlbCA9IGRvY3VtZW50LmNyZWF0ZUVsZW1lbnQoJ2J1dHRvbicpOwogIGlucHV0RWxlbWVudC5wYXJlbnRFbGVtZW50LmFwcGVuZENoaWxkKGNhbmNlbCk7CiAgY2FuY2VsLnRleHRDb250ZW50ID0gJ0NhbmNlbCB1cGxvYWQnOwogIGNvbnN0IGNhbmNlbFByb21pc2UgPSBuZXcgUHJvbWl
zZSgocmVzb2x2ZSkgPT4gewogICAgY2FuY2VsLm9uY2xpY2sgPSAoKSA9PiB7CiAgICAgIHJlc29sdmUobnVsbCk7CiAgICB9OwogIH0pOwoKICAvLyBXYWl0IGZvciB0aGUgdXNlciB0byBwaWNrIHRoZSBmaWxlcy4KICBjb25zdCBmaWxlcyA9IHlpZWxkIHsKICAgIHByb21pc2U6IFByb21pc2UucmFjZShbcGlja2VkUHJvbWlzZSwgY2FuY2VsUHJvbWlzZV0pLAogICAgcmVzcG9uc2U6IHsKICAgICAgYWN0aW9uOiAnc3RhcnRpbmcnLAogICAgfQogIH07CgogIGNhbmNlbC5yZW1vdmUoKTsKCiAgLy8gRGlzYWJsZSB0aGUgaW5wdXQgZWxlbWVudCBzaW5jZSBmdXJ0aGVyIHBpY2tzIGFyZSBub3QgYWxsb3dlZC4KICBpbnB1dEVsZW1lbnQuZGlzYWJsZWQgPSB0cnVlOwoKICBpZiAoIWZpbGVzKSB7CiAgICByZXR1cm4gewogICAgICByZXNwb25zZTogewogICAgICAgIGFjdGlvbjogJ2NvbXBsZXRlJywKICAgICAgfQogICAgfTsKICB9CgogIGZvciAoY29uc3QgZmlsZSBvZiBmaWxlcykgewogICAgY29uc3QgbGkgPSBkb2N1bWVudC5jcmVhdGVFbGVtZW50KCdsaScpOwogICAgbGkuYXBwZW5kKHNwYW4oZmlsZS5uYW1lLCB7Zm9udFdlaWdodDogJ2JvbGQnfSkpOwogICAgbGkuYXBwZW5kKHNwYW4oCiAgICAgICAgYCgke2ZpbGUudHlwZSB8fCAnbi9hJ30pIC0gJHtmaWxlLnNpemV9IGJ5dGVzLCBgICsKICAgICAgICBgbGFzdCBtb2RpZmllZDogJHsKICAgICAgICAgICAgZmlsZS5sYXN0TW9kaWZpZWREYXRlID8gZmlsZS5sYXN0TW9kaWZpZWREYXRlLnRvTG9jYWxlRGF0ZVN0cmluZygpIDoKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgJ24vYSd9IC0gYCkpOwogICAgY29uc3QgcGVyY2VudCA9IHNwYW4oJzAlIGRvbmUnKTsKICAgIGxpLmFwcGVuZENoaWxkKHBlcmNlbnQpOwoKICAgIG91dHB1dEVsZW1lbnQuYXBwZW5kQ2hpbGQobGkpOwoKICAgIGNvbnN0IGZpbGVEYXRhUHJvbWlzZSA9IG5ldyBQcm9taXNlKChyZXNvbHZlKSA9PiB7CiAgICAgIGNvbnN0IHJlYWRlciA9IG5ldyBGaWxlUmVhZGVyKCk7CiAgICAgIHJlYWRlci5vbmxvYWQgPSAoZSkgPT4gewogICAgICAgIHJlc29sdmUoZS50YXJnZXQucmVzdWx0KTsKICAgICAgfTsKICAgICAgcmVhZGVyLnJlYWRBc0FycmF5QnVmZmVyKGZpbGUpOwogICAgfSk7CiAgICAvLyBXYWl0IGZvciB0aGUgZGF0YSB0byBiZSByZWFkeS4KICAgIGxldCBmaWxlRGF0YSA9IHlpZWxkIHsKICAgICAgcHJvbWlzZTogZmlsZURhdGFQcm9taXNlLAogICAgICByZXNwb25zZTogewogICAgICAgIGFjdGlvbjogJ2NvbnRpbnVlJywKICAgICAgfQogICAgfTsKCiAgICAvLyBVc2UgYSBjaHVua2VkIHNlbmRpbmcgdG8gYXZvaWQgbWVzc2FnZSBzaXplIGxpbWl0cy4gU2VlIGIvNjIxMTU2NjAuCiAgICBsZXQgcG9zaXRpb24gPSAwOwogICAgZG8gewogICAgICBjb25zdCBsZW5ndGggPSBNYXRoLm1pbihmaWxlRGF0YS5ieXRlTGVuZ3RoIC0gcG9zaXRpb24sIE1BWF9QQVlMT0FEX1N
JWkUpOwogICAgICBjb25zdCBjaHVuayA9IG5ldyBVaW50OEFycmF5KGZpbGVEYXRhLCBwb3NpdGlvbiwgbGVuZ3RoKTsKICAgICAgcG9zaXRpb24gKz0gbGVuZ3RoOwoKICAgICAgY29uc3QgYmFzZTY0ID0gYnRvYShTdHJpbmcuZnJvbUNoYXJDb2RlLmFwcGx5KG51bGwsIGNodW5rKSk7CiAgICAgIHlpZWxkIHsKICAgICAgICByZXNwb25zZTogewogICAgICAgICAgYWN0aW9uOiAnYXBwZW5kJywKICAgICAgICAgIGZpbGU6IGZpbGUubmFtZSwKICAgICAgICAgIGRhdGE6IGJhc2U2NCwKICAgICAgICB9LAogICAgICB9OwoKICAgICAgbGV0IHBlcmNlbnREb25lID0gZmlsZURhdGEuYnl0ZUxlbmd0aCA9PT0gMCA/CiAgICAgICAgICAxMDAgOgogICAgICAgICAgTWF0aC5yb3VuZCgocG9zaXRpb24gLyBmaWxlRGF0YS5ieXRlTGVuZ3RoKSAqIDEwMCk7CiAgICAgIHBlcmNlbnQudGV4dENvbnRlbnQgPSBgJHtwZXJjZW50RG9uZX0lIGRvbmVgOwoKICAgIH0gd2hpbGUgKHBvc2l0aW9uIDwgZmlsZURhdGEuYnl0ZUxlbmd0aCk7CiAgfQoKICAvLyBBbGwgZG9uZS4KICB5aWVsZCB7CiAgICByZXNwb25zZTogewogICAgICBhY3Rpb246ICdjb21wbGV0ZScsCiAgICB9CiAgfTsKfQoKc2NvcGUuZ29vZ2xlID0gc2NvcGUuZ29vZ2xlIHx8IHt9OwpzY29wZS5nb29nbGUuY29sYWIgPSBzY29wZS5nb29nbGUuY29sYWIgfHwge307CnNjb3BlLmdvb2dsZS5jb2xhYi5fZmlsZXMgPSB7CiAgX3VwbG9hZEZpbGVzLAogIF91cGxvYWRGaWxlc0NvbnRpbnVlLAp9Owp9KShzZWxmKTsK",
82+
"headers": [
83+
[
84+
"content-type",
85+
"application/javascript"
86+
]
87+
],
88+
"ok": true,
89+
"status": 200,
90+
"status_text": ""
91+
}
92+
}
93+
},
94+
"id": "S8RM8c6AS8AX",
95+
"outputId": "e6e3d6b7-bd49-4dd9-a28e-59a4193187aa"
96+
},
7097
"outputs": [
98+
{
99+
"data": {
100+
"text/html": [
101+
"\n",
102+
" <input type=\"file\" id=\"files-d0856459-a7c7-4318-a2a4-6c118d5723ab\" name=\"files[]\" multiple disabled\n",
103+
" style=\"border:none\" />\n",
104+
" <output id=\"result-d0856459-a7c7-4318-a2a4-6c118d5723ab\">\n",
105+
" Upload widget is only available when the cell has been executed in the\n",
106+
" current browser session. Please rerun this cell to enable.\n",
107+
" </output>\n",
108+
" <script src=\"/nbextensions/google.colab/files.js\"></script> "
109+
],
110+
"text/plain": [
111+
"<IPython.core.display.HTML object>"
112+
]
113+
},
114+
"metadata": {
115+
"tags": []
116+
},
117+
"output_type": "display_data"
118+
},
71119
{
72120
"name": "stdout",
73121
"output_type": "stream",
74122
"text": [
75-
"File already exists\n"
123+
"Saving amazon_cells_labelled.txt to amazon_cells_labelled.txt\n",
124+
"Saving imdb_labelled.txt to imdb_labelled.txt\n",
125+
"Saving yelp_labelled.txt to yelp_labelled.txt\n"
76126
]
77127
}
78128
],
79129
"source": [
80-
"if not os.path.exists(path+\"\\sentiment_sentences.txt\"):\n",
81-
" file = open(os.path.join(path, fil), 'w')\n",
82-
" file.close()\n",
130+
"try:\n",
131+
" from google.colab import files\n",
83132
" \n",
84-
" # combined the three files to make sentiment_sentences.txt\n",
85-
" filenames = ['amazon_cells_labelled.txt', 'imdb_labelled.txt', 'yelp_labelled.txt']\n",
133+
" # upload 'amazon_cells_labelled.txt', 'imdb_labelled.txt' and 'yelp_labelled.txt'\n",
134+
" uploaded = files.upload()\n",
135+
" \n",
136+
" !mkdir DATAPATH\n",
137+
" !mv -t DATAPATH amazon_cells_labelled.txt imdb_labelled.txt yelp_labelled.txt\n",
138+
" !cat DATAPATH/amazon_cells_labelled.txt DATAPATH/imdb_labelled.txt DATAPATH/yelp_labelled.txt > DATAPATH/sentiment_sentences.txt\n",
139+
" \n",
140+
"except ModuleNotFoundError:\n",
141+
"\n",
142+
" fil = 'sentiment_sentences.txt'\n",
143+
"\n",
144+
" if not os.path.exists(\"Data/sentiment_sentences.txt\"):\n",
145+
" file = open(os.path.join('Data', fil), 'w')\n",
146+
" file.close()\n",
147+
" \n",
148+
" # combined the three files to make sentiment_sentences.txt\n",
149+
" filenames = ['amazon_cells_labelled.txt', 'imdb_labelled.txt', 'yelp_labelled.txt']\n",
86150
"\n",
87-
" with open(path+'\\sentiment_sentences.txt', 'w') as outfile:\n",
88-
" for fname in filenames:\n",
89-
" with open(path + '\\sentiment labelled sentences\\\\' + fname) as infile:\n",
90-
" outfile.write(infile.read())\n",
91-
" print(\"File created\")\n",
92-
"else:\n",
93-
" print(\"File already exists\")"
151+
" with open('Data/sentiment_sentences.txt', 'w') as outfile:\n",
152+
" for fname in filenames:\n",
153+
" with open('Data/sentiment labelled sentences/' + fname) as infile:\n",
154+
" outfile.write(infile.read())\n",
155+
" print(\"File created\")\n",
156+
" else:\n",
157+
" print(\"File already exists\")"
94158
]
95159
},
96160
{
97161
"cell_type": "code",
98-
"execution_count": 4,
162+
"execution_count": 3,
99163
"metadata": {
100-
"colab": {},
101-
"colab_type": "code",
164+
"colab": {
165+
"base_uri": "https://localhost:8080/"
166+
},
102167
"id": "COUGXAxcb_H5",
103-
"outputId": "f1b6d8ad-e22b-4126-d2ea-862697c4158b",
168+
"outputId": "dd37e92a-942a-49c6-aa2e-8c48e542ec1b",
104169
"scrolled": true
105170
},
106171
"outputs": [
107172
{
108173
"name": "stdout",
109174
"output_type": "stream",
110175
"text": [
111-
"Wall time: 15.7 s\n",
176+
"--2021-07-04 11:24:51-- https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz\n",
177+
"Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.225.99\n",
178+
"Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.225.99|:443... connected.\n",
179+
"HTTP request sent, awaiting response... 200 OK\n",
180+
"Length: 1647046227 (1.5G) [application/x-gzip]\n",
181+
"Saving to: ‘DATAPATH/GoogleNews-vectors-negative300.bin.gz’\n",
182+
"\n",
183+
"GoogleNews-vectors- 100%[===================>] 1.53G 75.3MB/s in 26s \n",
184+
"\n",
185+
"2021-07-04 11:25:17 (60.2 MB/s) - ‘DATAPATH/GoogleNews-vectors-negative300.bin.gz’ saved [1647046227/1647046227]\n",
186+
"\n",
187+
"CPU times: user 3 µs, sys: 1e+03 ns, total: 4 µs\n",
188+
"Wall time: 9.06 µs\n",
112189
"done loading Word2Vec\n"
113190
]
114191
}
115192
],
116193
"source": [
117194
"#Load the pre-trained word2vec model and the dataset\n",
118195
"try:\n",
196+
" \n",
119197
" from google.colab import files\n",
120-
" data_path= \"DATAPATH\" \n",
198+
" data_path= \"DATAPATH\"\n",
199+
" !wget -P DATAPATH https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz\n",
200+
" !gunzip DATAPATH/GoogleNews-vectors-negative300.bin.gz \n",
121201
" path_to_model = 'DATAPATH/GoogleNews-vectors-negative300.bin'\n",
122202
" training_data_path = \"DATAPATH/sentiment_sentences.txt\"\n",
203+
" \n",
123204
"except ModuleNotFoundError:\n",
124-
" data_path= \"Data\" \n",
125205
" \n",
126-
" if not os.path.exists('../Ch2/GoogleNews-vectors-negative300.bin'):\n",
127-
" if not os.path.exists('../Ch3/GoogleNews-vectors-negative300.bin'):\n",
128-
" wget.download(\"https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz\")\n",
129-
" path_to_model = 'GoogleNews-vectors-negative300.bin'\n",
206+
" data_path= \"Data\"\n",
207+
" \n",
208+
" if not os.path.exists('GoogleNews-vectors-negative300.bin'):\n",
209+
" if not os.path.exists('../Ch2/GoogleNews-vectors-negative300.bin'):\n",
210+
" if not os.path.exists('../Ch3/GoogleNews-vectors-negative300.bin'):\n",
211+
" wget.download(\"https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz\")\n",
212+
"\n",
213+
" with gzip.open('GoogleNews-vectors-negative300.bin.gz', 'rb') as f_in:\n",
214+
" with open('GoogleNews-vectors-negative300.bin', 'wb') as f_out:\n",
215+
" shutil.copyfileobj(f_in, f_out)\n",
216+
"\n",
217+
" path_to_model = 'GoogleNews-vectors-negative300.bin'\n",
218+
" else:\n",
219+
" path_to_model = '../Ch3/GoogleNews-vectors-negative300.bin'\n",
220+
"\n",
130221
" else:\n",
131-
" path_to_model = '../Ch3/GoogleNews-vectors-negative300.bin'\n",
132-
" \n",
222+
" path_to_model = '../Ch2/GoogleNews-vectors-negative300.bin'\n",
133223
" else:\n",
134-
" path_to_model = '../Ch2/GoogleNews-vectors-negative300.bin'\n",
224+
" path_to_model = 'GoogleNews-vectors-negative300.bin'\n",
135225
" \n",
136226
" training_data_path = os.path.join(data_path, \"sentiment_sentences.txt\")\n",
137-
"\n",
138-
"\n",
139-
"\n",
227+
" \n",
228+
" \n",
140229
"#Load W2V model. This will take some time. \n",
141-
"%time w2v_model = KeyedVectors.load_word2vec_format(path_to_model, binary=True)\n",
230+
"# %time must prefix the statement it measures; on a line by itself it times nothing.\n",
231+
"%time w2v_model = KeyedVectors.load_word2vec_format(path_to_model, binary=True)\n",
142232
"print('done loading Word2Vec')\n",
143233
"\n",
144234
"#Read text data, cats.\n",
@@ -154,12 +244,13 @@
154244
},
155245
{
156246
"cell_type": "code",
157-
"execution_count": 5,
247+
"execution_count": 4,
158248
"metadata": {
159-
"colab": {},
160-
"colab_type": "code",
249+
"colab": {
250+
"base_uri": "https://localhost:8080/"
251+
},
161252
"id": "m-WjFyC6b_IE",
162-
"outputId": "5df9e11b-6f8e-42b8-e198-6fe343293cc3"
253+
"outputId": "ce75ae49-eaf7-4af9-fc70-28ad6fb984eb"
163254
},
164255
"outputs": [
165256
{
@@ -179,12 +270,13 @@
179270
},
180271
{
181272
"cell_type": "code",
182-
"execution_count": 6,
273+
"execution_count": 5,
183274
"metadata": {
184-
"colab": {},
185-
"colab_type": "code",
275+
"colab": {
276+
"base_uri": "https://localhost:8080/"
277+
},
186278
"id": "XEz30Jztb_IP",
187-
"outputId": "2169b2c9-e89f-439a-a23f-d322fb856841"
279+
"outputId": "321faadb-db0a-4d2b-9c4f-a504f29accf7"
188280
},
189281
"outputs": [
190282
{
@@ -207,12 +299,13 @@
207299
},
208300
{
209301
"cell_type": "code",
210-
"execution_count": 7,
302+
"execution_count": 6,
211303
"metadata": {
212-
"colab": {},
213-
"colab_type": "code",
304+
"colab": {
305+
"base_uri": "https://localhost:8080/"
306+
},
214307
"id": "MFOGaDTwb_Ig",
215-
"outputId": "7603e297-9167-43ec-c7da-46d82dc850ad"
308+
"outputId": "ccaf5749-fff8-440e-8709-026b1394afc1"
216309
},
217310
"outputs": [
218311
{
@@ -245,12 +338,13 @@
245338
},
246339
{
247340
"cell_type": "code",
248-
"execution_count": 8,
341+
"execution_count": 7,
249342
"metadata": {
250-
"colab": {},
251-
"colab_type": "code",
343+
"colab": {
344+
"base_uri": "https://localhost:8080/"
345+
},
252346
"id": "fXRiGtY1b_Iq",
253-
"outputId": "2d57a96f-8da8-4285-ca1e-2c617578b9e1"
347+
"outputId": "2edbe27b-0400-4df7-f1f0-8b0d20549892"
254348
},
255349
"outputs": [
256350
{
@@ -287,28 +381,29 @@
287381
},
288382
{
289383
"cell_type": "code",
290-
"execution_count": 9,
384+
"execution_count": 8,
291385
"metadata": {
292-
"colab": {},
293-
"colab_type": "code",
386+
"colab": {
387+
"base_uri": "https://localhost:8080/"
388+
},
294389
"id": "mr9IaQppb_Ix",
295-
"outputId": "13a84b5c-fde3-49f4-b156-5c2f36592b19"
390+
"outputId": "0d1c168d-daac-40c9-b725-98a6406bc309"
296391
},
297392
"outputs": [
298393
{
299394
"name": "stdout",
300395
"output_type": "stream",
301396
"text": [
302-
"Accuracy: 0.8173333333333334\n",
397+
"Accuracy: 0.816\n",
303398
" precision recall f1-score support\n",
304399
"\n",
305400
" 0\n",
306-
" 0.79 0.82 0.81 350\n",
401+
" 0.84 0.81 0.83 404\n",
307402
" 1\n",
308-
" 0.84 0.81 0.83 400\n",
403+
" 0.79 0.82 0.80 346\n",
309404
"\n",
310405
" accuracy 0.82 750\n",
311-
" macro avg 0.82 0.82 0.82 750\n",
406+
" macro avg 0.81 0.82 0.82 750\n",
312407
"weighted avg 0.82 0.82 0.82 750\n",
313408
"\n"
314409
]
@@ -327,7 +422,6 @@
327422
{
328423
"cell_type": "markdown",
329424
"metadata": {
330-
"colab_type": "text",
331425
"id": "k7wjLB8rb_JB"
332426
},
333427
"source": [
@@ -337,7 +431,8 @@
337431
],
338432
"metadata": {
339433
"colab": {
340-
"name": "Word2Vec_Example.ipynb",
434+
"collapsed_sections": [],
435+
"name": "03_Word2Vec_Example.ipynb",
341436
"provenance": []
342437
},
343438
"kernelspec": {
@@ -355,7 +450,7 @@
355450
"name": "python",
356451
"nbconvert_exporter": "python",
357452
"pygments_lexer": "ipython3",
358-
"version": "3.7.0"
453+
"version": "3.7.4"
359454
}
360455
},
361456
"nbformat": 4,

0 commit comments

Comments
 (0)