Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
  • Loading branch information
Douwe Osinga committed Jul 22, 2019
2 parents f01d186 + d74e755 commit 9130b78
Show file tree
Hide file tree
Showing 19 changed files with 496 additions and 7,128 deletions.
70 changes: 35 additions & 35 deletions 04.1 Collect movie data from Wikipedia.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -2,17 +2,9 @@
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Using TensorFlow backend.\n"
]
}
],
"outputs": [],
"source": [
"import requests\n",
"from bs4 import BeautifulSoup\n",
Expand All @@ -26,6 +18,7 @@
"import xml.sax\n",
"\n",
"import subprocess\n",
"import re\n",
"import mwparserfromhell\n",
"import json"
]
Expand Down Expand Up @@ -60,13 +53,13 @@
{
"data": {
"text/plain": [
"['20180601/',\n",
" '20180620/',\n",
" '20180701/',\n",
" '20180720/',\n",
" '20180801/',\n",
" '20180820/',\n",
" '20180901/']"
"['20190201/',\n",
" '20190220/',\n",
" '20190301/',\n",
" '20190320/',\n",
" '20190401/',\n",
" '20190420/',\n",
" '20190501/']"
]
},
"execution_count": 4,
Expand All @@ -89,7 +82,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"20180901/\n"
"20190501/\n"
]
}
],
Expand All @@ -107,16 +100,18 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Downloading data from https://dumps.wikimedia.org//enwiki/20180901/enwiki-20180901-pages-articles.xml.bz2\n",
" 7939751936/15398410099 [==============>...............] - ETA: 1:00:18"
]
"data": {
"text/plain": [
"'/home/douwe/.keras/datasets/enwiki-20190501-pages-articles.xml.bz2'"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
Expand All @@ -128,7 +123,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 7,
"metadata": {
"collapsed": true
},
Expand All @@ -151,7 +146,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 8,
"metadata": {
"collapsed": true
},
Expand Down Expand Up @@ -186,10 +181,8 @@
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"parser = xml.sax.make_parser()\n",
Expand All @@ -204,16 +197,23 @@
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"with open('generated/wp_movies.ndjson', 'wt') as fout:\n",
" for movie in handler._movies:\n",
" fout.write(json.dumps(movie) + '\\n')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
Expand Down
63 changes: 40 additions & 23 deletions 07.3 Tweet Embeddings.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,17 @@
"cells": [
{
"cell_type": "code",
"execution_count": 24,
"execution_count": 2,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Using TensorFlow backend.\n"
]
}
],
"source": [
"import random\n",
"import twitter\n",
Expand All @@ -18,7 +26,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 3,
"metadata": {
"collapsed": true
},
Expand All @@ -34,16 +42,16 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"dict_keys(['source', 'in_reply_to_status_id', 'favorite_count', 'place', 'created_at', 'id', 'favorited', 'retweet_count', 'in_reply_to_user_id_str', 'display_text_range', 'in_reply_to_screen_name', 'contributors', 'in_reply_to_user_id', 'truncated', 'lang', 'coordinates', 'retweeted', 'timestamp_ms', 'id_str', 'text', 'is_quote_status', 'user', 'in_reply_to_status_id_str', 'filter_level', 'entities', 'geo'])"
"dict_keys(['filter_level', 'in_reply_to_screen_name', 'user', 'coordinates', 'id_str', 'favorited', 'truncated', 'contributors', 'in_reply_to_status_id_str', 'source', 'retweet_count', 'in_reply_to_status_id', 'entities', 'extended_entities', 'lang', 'favorite_count', 'retweeted_status', 'reply_count', 'id', 'place', 'text', 'is_quote_status', 'quote_count', 'timestamp_ms', 'in_reply_to_user_id_str', 'geo', 'possibly_sensitive', 'created_at', 'retweeted', 'in_reply_to_user_id'])"
]
},
"execution_count": 13,
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -62,17 +70,17 @@
},
{
"cell_type": "code",
"execution_count": 41,
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"!\n",
"['ico', 'is', 'open', 'to', 'investors']\n",
"['louisegrimmer', 'the', 'grocery', 'dr', 'thehenryjones', 'utas', 'tasmanianretail', 'so', 'excited']\n",
"['itscaasho', 'say', 'wallahi', \"he's\", 'talking', 'to', 'a', 'dead', 'dog', 'loooool']\n"
"['narendramodi', 'best', 'wishes', 'to', 'our', 'former', 'prime', 'minister', 'shri', 'hd', 'deve', 'gowda', 'ji', 'on', 'his', 'birthday', 'i', 'pray', 'for', 'his', 'long', 'and', 'healthy', 'life']\n",
"['ty', 'lightwood', 'aristotle', 'and', 'dante', 'discover', 'the', 'secrets', 'of', 'the', 'universe', '2019']\n",
"['goaaaaaalllll', 'gabrieeeell', 'jesuss']\n"
]
}
],
Expand All @@ -90,7 +98,6 @@
" self.stream = stream\n",
"\n",
" def __iter__(self):\n",
" print('!')\n",
" count = self.tweet_count\n",
" for tweet in self.stream:\n",
" if tweet.get('lang') != 'en':\n",
Expand All @@ -116,7 +123,7 @@
},
{
"cell_type": "code",
"execution_count": 58,
"execution_count": 7,
"metadata": {},
"outputs": [
{
Expand All @@ -128,38 +135,48 @@
}
],
"source": [
"tweets += list(TokensYielder(70000, twitter.TwitterStream(auth=auth).statuses.sample()))"
"tweets = list(TokensYielder(70000, twitter.TwitterStream(auth=auth).statuses.sample()))"
]
},
{
"cell_type": "code",
"execution_count": 59,
"metadata": {},
"execution_count": 8,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"model = gensim.models.Word2Vec(tweets, \n",
" workers=5,\n",
" min_count=2,\n",
" )\n",
"model.save('twitter_stream_w2v.model')"
"model.save('zoo/07/twitter_stream_w2v.model')"
]
},
{
"cell_type": "code",
"execution_count": 72,
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/douwe/checkout/deep_learning_cookbook/venv3/lib/python3.5/site-packages/gensim/matutils.py:737: FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`.\n",
" if np.issubdtype(vec.dtype, np.int):\n"
]
},
{
"data": {
"text/plain": [
"[('hate', 0.7243724465370178),\n",
" ('loved', 0.7227891087532043),\n",
" ('453', 0.707709789276123),\n",
" ('melanin', 0.7069753408432007),\n",
" ('appreciate', 0.696381688117981)]"
"[('miss', 0.822679877281189),\n",
" ('hope', 0.8068050146102905),\n",
" ('loved', 0.8038904666900635),\n",
" ('appreciate', 0.8034697771072388),\n",
" ('ramblingsloa', 0.8009338974952698)]"
]
},
"execution_count": 72,
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
Expand Down
Loading

0 comments on commit 9130b78

Please sign in to comment.