Merge branch 'master' of https://github.com/DOsinga/deep_learning_coo…

…kbook
DOsinga · Jul 22, 2019 · 9130b78 · 9130b78
2 parents f01d186 + d74e755
commit 9130b78
Show file tree

Hide file tree

Showing 19 changed files with 496 additions and 7,128 deletions.
diff --git a/04.1 Collect movie data from Wikipedia.ipynb b/04.1 Collect movie data from Wikipedia.ipynb
@@ -2,17 +2,9 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 10,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Using TensorFlow backend.\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "import requests\n",
     "from bs4 import BeautifulSoup\n",
@@ -26,6 +18,7 @@
     "import xml.sax\n",
     "\n",
     "import subprocess\n",
+    "import re\n",
     "import mwparserfromhell\n",
     "import json"
    ]
@@ -60,13 +53,13 @@
     {
      "data": {
       "text/plain": [
-       "['20180601/',\n",
-       " '20180620/',\n",
-       " '20180701/',\n",
-       " '20180720/',\n",
-       " '20180801/',\n",
-       " '20180820/',\n",
-       " '20180901/']"
+       "['20190201/',\n",
+       " '20190220/',\n",
+       " '20190301/',\n",
+       " '20190320/',\n",
+       " '20190401/',\n",
+       " '20190420/',\n",
+       " '20190501/']"
       ]
      },
      "execution_count": 4,
@@ -89,7 +82,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "20180901/\n"
+      "20190501/\n"
      ]
     }
    ],
@@ -107,16 +100,18 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [
     {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Downloading data from https://dumps.wikimedia.org//enwiki/20180901/enwiki-20180901-pages-articles.xml.bz2\n",
-      " 7939751936/15398410099 [==============>...............] - ETA: 1:00:18"
-     ]
+     "data": {
+      "text/plain": [
+       "'/home/douwe/.keras/datasets/enwiki-20190501-pages-articles.xml.bz2'"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
     }
    ],
    "source": [
@@ -128,7 +123,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 7,
    "metadata": {
     "collapsed": true
    },
@@ -151,7 +146,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 8,
    "metadata": {
     "collapsed": true
    },
@@ -186,10 +181,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "collapsed": true
-   },
+   "execution_count": 16,
+   "metadata": {},
    "outputs": [],
    "source": [
     "parser = xml.sax.make_parser()\n",
@@ -204,16 +197,23 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "collapsed": true
-   },
+   "execution_count": 20,
+   "metadata": {},
    "outputs": [],
    "source": [
     "with open('generated/wp_movies.ndjson', 'wt') as fout:\n",
     "    for movie in handler._movies:\n",
     "         fout.write(json.dumps(movie) + '\\n')"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": []
   }
  ],
  "metadata": {

diff --git a/07.3 Tweet Embeddings.ipynb b/07.3 Tweet Embeddings.ipynb
@@ -2,9 +2,17 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 24,
+   "execution_count": 2,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Using TensorFlow backend.\n"
+     ]
+    }
+   ],
    "source": [
     "import random\n",
     "import twitter\n",
@@ -18,7 +26,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 3,
    "metadata": {
     "collapsed": true
    },
@@ -34,16 +42,16 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "dict_keys(['source', 'in_reply_to_status_id', 'favorite_count', 'place', 'created_at', 'id', 'favorited', 'retweet_count', 'in_reply_to_user_id_str', 'display_text_range', 'in_reply_to_screen_name', 'contributors', 'in_reply_to_user_id', 'truncated', 'lang', 'coordinates', 'retweeted', 'timestamp_ms', 'id_str', 'text', 'is_quote_status', 'user', 'in_reply_to_status_id_str', 'filter_level', 'entities', 'geo'])"
+       "dict_keys(['filter_level', 'in_reply_to_screen_name', 'user', 'coordinates', 'id_str', 'favorited', 'truncated', 'contributors', 'in_reply_to_status_id_str', 'source', 'retweet_count', 'in_reply_to_status_id', 'entities', 'extended_entities', 'lang', 'favorite_count', 'retweeted_status', 'reply_count', 'id', 'place', 'text', 'is_quote_status', 'quote_count', 'timestamp_ms', 'in_reply_to_user_id_str', 'geo', 'possibly_sensitive', 'created_at', 'retweeted', 'in_reply_to_user_id'])"
       ]
      },
-     "execution_count": 13,
+     "execution_count": 4,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -62,17 +70,17 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 41,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
       "!\n",
-      "['ico', 'is', 'open', 'to', 'investors']\n",
-      "['louisegrimmer', 'the', 'grocery', 'dr', 'thehenryjones', 'utas', 'tasmanianretail', 'so', 'excited']\n",
-      "['itscaasho', 'say', 'wallahi', \"he's\", 'talking', 'to', 'a', 'dead', 'dog', 'loooool']\n"
+      "['narendramodi', 'best', 'wishes', 'to', 'our', 'former', 'prime', 'minister', 'shri', 'hd', 'deve', 'gowda', 'ji', 'on', 'his', 'birthday', 'i', 'pray', 'for', 'his', 'long', 'and', 'healthy', 'life']\n",
+      "['ty', 'lightwood', 'aristotle', 'and', 'dante', 'discover', 'the', 'secrets', 'of', 'the', 'universe', '2019']\n",
+      "['goaaaaaalllll', 'gabrieeeell', 'jesuss']\n"
      ]
     }
    ],
@@ -90,7 +98,6 @@
     "        self.stream = stream\n",
     "\n",
     "    def __iter__(self):\n",
-    "        print('!')\n",
     "        count = self.tweet_count\n",
     "        for tweet in self.stream:\n",
     "            if tweet.get('lang') != 'en':\n",
@@ -116,7 +123,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 58,
+   "execution_count": 7,
    "metadata": {},
    "outputs": [
     {
@@ -128,38 +135,48 @@
     }
    ],
    "source": [
-    "tweets += list(TokensYielder(70000, twitter.TwitterStream(auth=auth).statuses.sample()))"
+    "tweets = list(TokensYielder(70000, twitter.TwitterStream(auth=auth).statuses.sample()))"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 59,
-   "metadata": {},
+   "execution_count": 8,
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "model = gensim.models.Word2Vec(tweets, \n",
     "                               workers=5,\n",
     "                               min_count=2,\n",
     "                              )\n",
-    "model.save('twitter_stream_w2v.model')"
+    "model.save('zoo/07/twitter_stream_w2v.model')"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 72,
+   "execution_count": 9,
    "metadata": {},
    "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/home/douwe/checkout/deep_learning_cookbook/venv3/lib/python3.5/site-packages/gensim/matutils.py:737: FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`.\n",
+      "  if np.issubdtype(vec.dtype, np.int):\n"
+     ]
+    },
     {
      "data": {
       "text/plain": [
-       "[('hate', 0.7243724465370178),\n",
-       " ('loved', 0.7227891087532043),\n",
-       " ('453', 0.707709789276123),\n",
-       " ('melanin', 0.7069753408432007),\n",
-       " ('appreciate', 0.696381688117981)]"
+       "[('miss', 0.822679877281189),\n",
+       " ('hope', 0.8068050146102905),\n",
+       " ('loved', 0.8038904666900635),\n",
+       " ('appreciate', 0.8034697771072388),\n",
+       " ('ramblingsloa', 0.8009338974952698)]"
       ]
      },
-     "execution_count": 72,
+     "execution_count": 9,
      "metadata": {},
      "output_type": "execute_result"
     }