In [42]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd

# Make the graphs a bit prettier, and bigger
pd.set_option('display.mpl_style', 'default')

# This is necessary to show lots of columns in pandas 0.12. 
# Not necessary in pandas 0.13.
pd.set_option('display.width', 5000) 
pd.set_option('display.max_columns', 60)

plt.rcParams['figure.figsize'] = (15, 5)
In [43]:
dbase = pd.read_csv('/home/al/projects/twitter/jays.csv')
In [44]:
dbase.set_index('created_at', drop=False, inplace=True)
In [45]:
#dbase.index
dbase[:3]
Out[45]:
_id name text created_at source location geo screen_name
created_at
2015-03-18 22:28 ObjectID(550a34687e9aed2d7eb9c661) liz RT @SNBarryDavis: The link for my story on @rg... 2015-03-18 22:28 Twitter Web Client toronto NaN biggles14
2015-03-18 22:27 ObjectID(550a34687e9aed2d7eb9c662) Matt Elliott @BringerOfRain20 time to grow it back #TeamUni... 2015-03-18 22:27 Twitter for iPhone Sturgeon Falls, ON NaN JaysFan1019
2015-03-18 22:26 ObjectID(550a34687e9aed2d7eb9c663) Patriot Flag 2016 RT @SNBarryDavis: The link for my story on @rg... 2015-03-18 22:26 Twitter for iPhone USA NaN patriotflag2016

3 rows × 8 columns

In [46]:
#dbase.head()
In [47]:
dbase.describe()
Out[47]:
_id name text created_at source location geo screen_name
count 11361 11361 11361 11361 11361 8177 182 11361
unique 11361 3938 7032 5287 108 1767 127 4028
top ObjectID(550a34d57e9aed2d7eb9e360) Toronto BlueJays RT @MLBMeme: Twitter has been around since 200... 2015-03-15 19:58 Twitter for iPhone Toronto { "type" : "Point", "coordinates" : [ 0, 0 ] } topBlueJays
freq 1 436 236 28 3189 1301 55 436

4 rows × 8 columns

In [48]:
total_tweets = pd.Series.count(dbase['created_at'])
total_tweets
Out[48]:
11361
In [49]:
dbase1m = pd.Series.value_counts(dbase['created_at'], normalize=False, sort=True, ascending=False, bins=None)
dbase1m[:10].plot(kind='bar')
Out[49]:
<matplotlib.axes.AxesSubplot at 0x7f5d36cb5ad0>
In [50]:
#dbase1m.sort_index()
In [51]:
tweet_freq = dbase1m.sort_index()
tweet_freq.plot()
Out[51]:
<matplotlib.axes.AxesSubplot at 0x7f5d364c4990>
In [52]:
avg = int(dbase1m.mean())
avg
Out[52]:
2
In [53]:
tweeted_from = pd.Series.value_counts(dbase['location'], normalize=False, sort=True, ascending=False, bins=None)
tweeted_from[:50].plot(kind='bar')
Out[53]:
<matplotlib.axes.AxesSubplot at 0x7f5d36483f90>
In [54]:
top_tweeter = pd.Series.value_counts(dbase['name'], normalize=False, sort=True, ascending=False, bins=None)
top_tweeter_screen_name = pd.Series.value_counts(dbase['screen_name'], normalize=False, sort=True, ascending=False, bins=None)
top_tweeter_screen_name[:20].plot(kind='bar')
Out[54]:
<matplotlib.axes.AxesSubplot at 0x7f5d3636c250>
In [55]:
source_tweet = pd.Series.value_counts(dbase['source'], normalize=False, sort=True, ascending=False, bins=None)
source_tweet[:10].plot(kind='bar')
Out[55]:
<matplotlib.axes.AxesSubplot at 0x7f5d35cd88d0>
In [56]:
import nltk
from nltk.corpus import stopwords
from nltk import FreqDist
stop = stopwords.words('english')
stop2 = stopwords.words('spanish')
otherwords = ['rt', 'vs', '-', '&amp;', '', '#bluejays']
text = dbase['text']
tokens = []
for txt in text.values:
    tokens.extend([t.lower().strip(":,.") for t in txt.split()])
In [57]:
filtered_tokens = [w for w in tokens if not w in stop]
filtered_tokens_spanish = [w for w in filtered_tokens if not w in stop2]
other_filtered = [w for w in filtered_tokens_spanish if not w in otherwords]
freq_dist = nltk.FreqDist(other_filtered)
z = 20
print "The {} most common words tweeted;".format(z)
sorted(freq_dist, key=lambda key: (-freq_dist[key], key))[:z]
/usr/local/lib/python2.7/dist-packages/IPython/kernel/__main__.py:1: UnicodeWarning: Unicode equal comparison failed to convert both arguments to Unicode - interpreting them as being unequal
  if __name__ == '__main__':
/usr/local/lib/python2.7/dist-packages/IPython/kernel/__main__.py:2: UnicodeWarning: Unicode equal comparison failed to convert both arguments to Unicode - interpreting them as being unequal
  from IPython.kernel.zmq import kernelapp as app
The 20 most common words tweeted;
Out[57]:
['#mlb',
 'jays',
 'blue',
 'game',
 '@wilnerness590',
 '@snbarrydavis',
 'stroman',
 'spring',
 'pillar',
 'castro',
 'today',
 '@sportsnet',
 'good',
 'kevin',
 '#springtraining',
 'toronto',
 '@bnicholsonsmith',
 '#jays',
 '#yankees',
 'marcus']
In [58]:
freq_dist.plot(20)
In [59]:
dbase.created_at.value_counts()
Out[59]:
2015-03-15 19:58    28
2015-03-14 13:29    22
2015-03-15 19:59    21
2015-03-14 13:25    20
2015-03-15 20:00    19
2015-03-14 13:28    19
2015-03-17 19:25    19
2015-03-17 21:58    17
2015-03-17 20:37    17
2015-03-11 09:08    16
2015-03-13 13:10    15
2015-03-17 20:36    15
2015-03-14 13:30    15
2015-03-15 20:03    15
2015-03-18 12:10    15
...
2015-03-13 11:49    1
2015-03-11 18:26    1
2015-03-18 20:04    1
2015-03-13 11:46    1
2015-03-13 11:41    1
2015-03-17 23:24    1
2015-03-12 20:06    1
2015-03-14 16:59    1
2015-03-17 15:49    1
2015-03-14 11:08    1
2015-03-14 16:55    1
2015-03-17 15:46    1
2015-03-17 15:47    1
2015-03-14 16:53    1
2015-03-17 01:36    1
Length: 5287, dtype: int64
In [60]:
source_geo = pd.Series.value_counts(dbase['geo'], normalize=False, sort=True, ascending=False, bins=None)
In [61]:
#source_geo
In [62]:
import ast
for lines in dbase['geo']:
    if type(lines) == str:
        mylines = ast.literal_eval(lines)
        games_geo_coordinates = mylines["coordinates"]
        print games_geo_coordinates[0],
        print ",",
        print games_geo_coordinates[1]
-81.55609854 , 28.33729058
-82.78712288 , 28.00395703
-82.79022017 , 27.95456699
-82.78711407 , 28.00391443
-82.78703585 , 28.00402312
-79.5704226 , 43.7306245
0 , 0
-79.35747849 , 43.64999624
0 , 0
-79.70975474 , 43.70966548
-80.9299352 , 46.5280254
-121.61116286 , 36.66878429
-79.4757658 , 43.84455257
-79.27022813 , 43.71144311
-80.9299386 , 46.5280246
0 , 0
-80.9299746 , 46.5280381
-79.27020632 , 43.71143623
-122.319586 , 49.1453094
0 , 0
0 , 0
0 , 0
0 , 0
0 , 0
0 , 0
-82.63336473 , 27.86649813
0 , 0
0 , 0
0 , 0
0 , 0
0 , 0
0 , 0
-113.5118567 , 53.5383261
-88.2272937 , 41.62348186
0 , 0
0 , 0
0 , 0
-79.39412399 , 43.68826059
0 , 0
-107.80836624 , 50.29056654
-80.9300416 , 46.5279807
0 , 0
0 , 0
0 , 0
0 , 0
0 , 0
-79.4047014 , 43.6746594
-79.3891065 , 43.64139486
-81.55596535 , 28.33765758
-81.55601259 , 28.33750402
-81.55618847 , 28.33779662
-82.78667345 , 28.00416845
-81.55607125 , 28.33754482
-81.55600408 , 28.33727055
-81.55572491 , 28.33762969
-81.55622461 , 28.33771847
-81.55615288 , 28.33744383
-81.55600746 , 28.33742601
-81.55573379 , 28.33765708
-81.55616759 , 28.33728723
-81.55619998 , 28.33758687
0 , 0
-79.37944038 , 43.65013132
-79.42882961 , 43.6697547
-81.55616389 , 28.33743261
-79.47489414 , 43.64579018
-79.38406243 , 43.66828393
-74.12450616 , 40.61301326
-79.38412896 , 43.66836265
-79.3913704 , 43.8686096
-79.87980369 , 44.74816569
-79.34399555 , 43.73409365
-81.4880278 , 28.45021294
0 , 0
-80.43498453 , 43.45453486
-79.57827642 , 43.65651445
0 , 0
0 , 0
-80.9299584 , 46.5280322
-80.9299664 , 46.5280449
-79.47957935 , 43.70131862
-80.9299677 , 46.5280439
0 , 0
0 , 0
0 , 0
-79.47833885 , 43.69856331
-80.9299694 , 46.5280433
-82.78722757 , 28.00374647
0 , 0
-82.78718121 , 28.00378953
-79.38416724 , 43.66844401
-79.77015272 , 43.22588109
0 , 0
0 , 0
-80.9299448 , 46.5280049
0 , 0
-80.9299696 , 46.5280435
-82.78709169 , 28.00365661
-80.9299704 , 46.528043
-80.9299629 , 46.5280406
0 , 0
-80.9299735 , 46.5280299
-114.09637872 , 51.03168336
-113.59464034 , 53.44875547
-80.9299674 , 46.5280459
0 , 0
-118.2775293 , 33.7870731
-80.929973 , 46.5280461
-82.7870316 , 28.00369534
0 , 0
-82.78727082 , 28.00364425
-114.03872935 , 50.94203687
-80.9299652 , 46.5280373
0 , 0
0 , 0
0 , 0
-79.3840831 , 43.670248
-82.795822 , 28.008347
-113.511999 , 53.538383
-82.78742965 , 28.00323513
-114.03850237 , 50.94217044
-79.39405125 , 43.64586383
-82.78710287 , 28.00400806
0 , 0
-73.9079268 , 40.85029299
-79.5361609 , 44.4778892
-82.82800275 , 27.98265135
-82.82802784 , 27.98260459
-83.08303918 , 40.11585069
-82.78700251 , 28.00421074
-82.78712342 , 28.00400291
-79.4420106 , 43.6540386
-83.08297568 , 40.11576073
-83.08289495 , 40.11581338
-83.0829287 , 40.11568385
-80.9299778 , 46.5280427
-82.78712564 , 28.00371357
-82.78691576 , 28.00428115
-83.08285167 , 40.11607796
-82.78711801 , 28.00373096
-83.08293676 , 40.11582282
0 , 0
0 , 0
0 , 0
0 , 0
-80.9300002 , 46.5280078
-83.08300199 , 40.11591795
-82.78758656 , 28.0037155
-82.78735522 , 28.00384156
-82.78711789 , 28.00382612
-82.78706848 , 28.00366126
-82.78619204 , 28.0037719
-113.5119617 , 53.5382353
-79.71606488 , 44.41117549
0 , 0
-82.38332189 , 27.9813011
-96.19195686 , 41.25453833
-123.1026697 , 49.28360322
-100.06262297 , 37.76654678
-100.06264154 , 37.7665168
0 , 0
0 , 0
0 , 0
-82.756316 , 28.033441
0 , 0
-99.76966739 , 37.1919417
0 , 0
-80.9407189 , 46.5272455
-80.9406609 , 46.5271437
-82.51722222 , 27.34777778
-2.17522348 , 52.19333782
-81.29888889 , 28.41583333
-81.9130623 , 26.4997814
-82.51722222 , 27.34777778
-81.9130653 , 26.4997819
-80.4718142 , 43.4611861
0 , 0
-80.9299428 , 46.5279497
-78.94666837 , 43.9134479
0 , 0
-80.929997 , 46.527999
0 , 0
In [63]:
import folium
from collections import namedtuple
filepath = "/home/al/projects/twitter/csv/jaysgeo.html"
na_coordinates = (43.641438,-79.389353) # Rogers Centre Toronto, Ontario
na_map = folium.Map(location=na_coordinates, zoom_start=4)
Location = namedtuple('Location', ['latitude', 'longitude'])
count_locations = 0

for lines in dbase['geo']:
    if type(lines) == str:
        count_locations += 1
        mylines = ast.literal_eval(lines)
        games_geo_coordinates = mylines["coordinates"]
        location = Location(games_geo_coordinates[1], games_geo_coordinates[0])
        na_map.circle_marker(location, popup=str(location))

na_map.create_map(filepath)
In [64]:
print str(count_locations) + " users reporting coordinates"
print str(total_tweets) + " total tweets"
percent = round(float(count_locations) / float(total_tweets) * 100, 2)
print str(percent) + "%"
182 users reporting coordinates
11361 total tweets
1.6%
In [84]:
from IPython.display import HTML
HTML('<iframe src=http://www.linuxnorth.org/jaysgeo.html width=960 height=530></iframe>')
Out[84]: