Python for Data Analysis

Data Wrangling with Pandas, NumPy,

and IPython

 

Wes McKinney

 

https://github.com/wesm/pydata-book

 

Data Loading, Storage, and File Formats

In [1]:
import numpy as np
import pandas as pd
np.random.seed(12345)
import matplotlib.pyplot as plt
plt.rc('figure', figsize=(10, 6))
np.set_printoptions(precision=4, suppress=True)

Reading and Writing Data in Text Format

In [2]:
!more examples\ex1.csv
a,b,c,d,message
1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo
In [3]:
df = pd.read_csv('examples/ex1.csv')
df
Out[3]:
a b c d message
0 1 2 3 4 hello
1 5 6 7 8 world
2 9 10 11 12 foo
In [4]:
pd.read_table('examples/ex1.csv', sep=',')
Out[4]:
a b c d message
0 1 2 3 4 hello
1 5 6 7 8 world
2 9 10 11 12 foo
In [7]:
!more examples\ex2.csv
1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo
In [10]:
pd.read_csv('examples/ex2.csv', header=None)
pd.read_csv('examples/ex2.csv', names=['a', 'b', 'c', 'd', 'message'])
Out[10]:
a b c d message
0 1 2 3 4 hello
1 5 6 7 8 world
2 9 10 11 12 foo
In [11]:
names = ['a', 'b', 'c', 'd', 'message']
pd.read_csv('examples/ex2.csv', names=names, index_col='message')
Out[11]:
a b c d
message
hello 1 2 3 4
world 5 6 7 8
foo 9 10 11 12
In [13]:
!more examples\csv_mindex.csv
parsed = pd.read_csv('examples/csv_mindex.csv',
                     index_col=['key1', 'key2'])
parsed
key1,key2,value1,value2
one,a,1,2
one,b,3,4
one,c,5,6
one,d,7,8
two,a,9,10
two,b,11,12
two,c,13,14
two,d,15,16
Out[13]:
value1 value2
key1 key2
one a 1 2
b 3 4
c 5 6
d 7 8
two a 9 10
b 11 12
c 13 14
d 15 16
In [14]:
list(open('examples/ex3.txt'))
Out[14]:
['            A         B         C\n',
 'aaa -0.264438 -1.026059 -0.619500\n',
 'bbb  0.927272  0.302904 -0.032399\n',
 'ccc -0.264273 -0.386314 -0.217601\n',
 'ddd -0.871858 -0.348382  1.100491\n']
In [15]:
result = pd.read_table('examples/ex3.txt', sep='\s+')
result
Out[15]:
A B C
aaa -0.264438 -1.026059 -0.619500
bbb 0.927272 0.302904 -0.032399
ccc -0.264273 -0.386314 -0.217601
ddd -0.871858 -0.348382 1.100491
In [17]:
!more examples\ex4.csv
pd.read_csv('examples/ex4.csv', skiprows=[0, 2, 3])
# hey!
a,b,c,d,message
# just wanted to make things more difficult for you
# who reads CSV files with computers, anyway?
1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo
Out[17]:
a b c d message
0 1 2 3 4 hello
1 5 6 7 8 world
2 9 10 11 12 foo
In [18]:
!more examples\ex5.csv
result = pd.read_csv('examples/ex5.csv')
result
pd.isnull(result)
something,a,b,c,d,message
one,1,2,3,4,NA
two,5,6,,8,world
three,9,10,11,12,foo
Out[18]:
something a b c d message
0 False False False False False True
1 False False False True False False
2 False False False False False False
In [19]:
result = pd.read_csv('examples/ex5.csv', na_values=['NULL'])
result
Out[19]:
something a b c d message
0 one 1 2 3.0 4 NaN
1 two 5 6 NaN 8 world
2 three 9 10 11.0 12 foo
In [20]:
sentinels = {'message': ['foo', 'NA'], 'something': ['two']}
pd.read_csv('examples/ex5.csv', na_values=sentinels)
Out[20]:
something a b c d message
0 one 1 2 3.0 4 NaN
1 NaN 5 6 NaN 8 world
2 three 9 10 11.0 12 NaN

Reading Text Files in Pieces

In [21]:
pd.options.display.max_rows = 10
In [22]:
result = pd.read_csv('examples/ex6.csv')
result
Out[22]:
one two three four key
0 0.467976 -0.038649 -0.295344 -1.824726 L
1 -0.358893 1.404453 0.704965 -0.200638 B
2 -0.501840 0.659254 -0.421691 -0.057688 G
3 0.204886 1.074134 1.388361 -0.982404 R
4 0.354628 -0.133116 0.283763 -0.837063 Q
... ... ... ... ... ...
9995 2.311896 -0.417070 -1.409599 -0.515821 L
9996 -0.479893 -0.650419 0.745152 -0.646038 E
9997 0.523331 0.787112 0.486066 1.093156 K
9998 -0.362559 0.598894 -1.843201 0.887292 G
9999 -0.096376 -1.012999 -0.657431 -0.573315 0

10000 rows × 5 columns

In [18]:
pd.read_csv('examples/ex6.csv', nrows=5)
Out[18]:
one two three four key
0 0.467976 -0.038649 -0.295344 -1.824726 L
1 -0.358893 1.404453 0.704965 -0.200638 B
2 -0.501840 0.659254 -0.421691 -0.057688 G
3 0.204886 1.074134 1.388361 -0.982404 R
4 0.354628 -0.133116 0.283763 -0.837063 Q
In [19]:
chunker = pd.read_csv('examples/ex6.csv', chunksize=1000)
chunker
Out[19]:
<pandas.io.parsers.TextFileReader at 0x28bbe433588>
In [20]:
chunker = pd.read_csv('examples/ex6.csv', chunksize=1000)

tot = pd.Series([])
for piece in chunker:
    tot = tot.add(piece['key'].value_counts(), fill_value=0)

tot = tot.sort_values(ascending=False)
In [21]:
tot[:10]
Out[21]:
E    368.0
X    364.0
L    346.0
O    343.0
Q    340.0
M    338.0
J    337.0
F    335.0
K    334.0
H    330.0
dtype: float64

Writing Data to Text Format

In [23]:
data = pd.read_csv('examples/ex5.csv')
data
Out[23]:
something a b c d message
0 one 1 2 3.0 4 NaN
1 two 5 6 NaN 8 world
2 three 9 10 11.0 12 foo
In [24]:
data.to_csv('examples\out.csv')
!more examples\out.csv
,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,foo
In [26]:
import sys
data.to_csv(sys.stdout, sep='|')
|something|a|b|c|d|message
0|one|1|2|3.0|4|
1|two|5|6||8|world
2|three|9|10|11.0|12|foo
In [27]:
data.to_csv(sys.stdout, na_rep='NULL')
,something,a,b,c,d,message
0,one,1,2,3.0,4,NULL
1,two,5,6,NULL,8,world
2,three,9,10,11.0,12,foo
In [27]:
data.to_csv(sys.stdout, index=False, header=False)
one,1,2,3.0,4,
two,5,6,,8,world
three,9,10,11.0,12,foo
In [28]:
data.to_csv(sys.stdout, index=False, columns=['a', 'b', 'c'])
a,b,c
1,2,3.0
5,6,
9,10,11.0
In [29]:
dates = pd.date_range('1/1/2000', periods=7)
ts = pd.Series(np.arange(7), index=dates)
ts.to_csv('examples/tseries.csv')
!more examples\tseries.csv
,0
2000-01-01,0
2000-01-02,1
2000-01-03,2
2000-01-04,3
2000-01-05,4
2000-01-06,5
2000-01-07,6

Working with Delimited Formats

In [30]:
!more examples\ex7.csv
"a","b","c"
"1","2","3"
"1","2","3"
In [31]:
import csv
f = open('examples/ex7.csv')

reader = csv.reader(f)
In [32]:
for line in reader:
    print(line)
['a', 'b', 'c']
['1', '2', '3']
['1', '2', '3']
In [33]:
with open('examples/ex7.csv') as f:
    lines = list(csv.reader(f))
In [35]:
header, values = lines[0], lines[1:]
In [36]:
data_dict = {h: v for h, v in zip(header, zip(*values))}
data_dict
Out[36]:
{'a': ('1', '1'), 'b': ('2', '2'), 'c': ('3', '3')}

class my_dialect(csv.Dialect): lineterminator = '\n' delimiter = ';' quotechar = '"' quoting = csv.QUOTE_MINIMAL

reader = csv.reader(f, dialect=my_dialect)

reader = csv.reader(f, delimiter='|')

with open('mydata.csv', 'w') as f: writer = csv.writer(f, dialect=my_dialect) writer.writerow(('one', 'two', 'three')) writer.writerow(('1', '2', '3')) writer.writerow(('4', '5', '6')) writer.writerow(('7', '8', '9'))

JSON Data

In [37]:
obj = """
{"name": "Wes",
 "places_lived": ["United States", "Spain", "Germany"],
 "pet": null,
 "siblings": [{"name": "Scott", "age": 30, "pets": ["Zeus", "Zuko"]},
              {"name": "Katie", "age": 38,
               "pets": ["Sixes", "Stache", "Cisco"]}]
}
"""
In [39]:
import json
result = json.loads(obj)
result
Out[39]:
{'name': 'Wes',
 'places_lived': ['United States', 'Spain', 'Germany'],
 'pet': None,
 'siblings': [{'name': 'Scott', 'age': 30, 'pets': ['Zeus', 'Zuko']},
  {'name': 'Katie', 'age': 38, 'pets': ['Sixes', 'Stache', 'Cisco']}]}
In [40]:
asjson = json.dumps(result)
In [41]:
siblings = pd.DataFrame(result['siblings'], columns=['name', 'age'])
siblings
Out[41]:
name age
0 Scott 30
1 Katie 38
In [42]:
!more examples\example.json
[{"a": 1, "b": 2, "c": 3},
 {"a": 4, "b": 5, "c": 6},
 {"a": 7, "b": 8, "c": 9}]
In [43]:
data = pd.read_json('examples/example.json')
data
Out[43]:
a b c
0 1 2 3
1 4 5 6
2 7 8 9
In [45]:
print(data.to_json())
print(data.to_json(orient='records'))
{"a":{"0":1,"1":4,"2":7},"b":{"0":2,"1":5,"2":8},"c":{"0":3,"1":6,"2":9}}
[{"a":1,"b":2,"c":3},{"a":4,"b":5,"c":6},{"a":7,"b":8,"c":9}]

XML and HTML: Web Scraping

conda install lxml pip install beautifulsoup4 html5lib

In [46]:
tables = pd.read_html('examples/fdic_failed_bank_list.html')
len(tables)
failures = tables[0]
failures.head()
Out[46]:
Bank Name City ST CERT Acquiring Institution Closing Date Updated Date
0 Allied Bank Mulberry AR 91 Today's Bank September 23, 2016 November 17, 2016
1 The Woodbury Banking Company Woodbury GA 11297 United Bank August 19, 2016 November 17, 2016
2 First CornerStone Bank King of Prussia PA 35312 First-Citizens Bank & Trust Company May 6, 2016 September 6, 2016
3 Trust Company Bank Memphis TN 9956 The Bank of Fayette County April 29, 2016 September 6, 2016
4 North Milwaukee State Bank Milwaukee WI 20364 First-Citizens Bank & Trust Company March 11, 2016 June 16, 2016
In [44]:
close_timestamps = pd.to_datetime(failures['Closing Date'])
close_timestamps.dt.year.value_counts()
Out[44]:
2010    157
2009    140
2011     92
2012     51
2008     25
       ... 
2004      4
2001      4
2007      3
2003      3
2000      2
Name: Closing Date, Length: 15, dtype: int64

Parsing XML with lxml.objectify

373889 Metro-North Railroad Escalator Availability Percent of the time that escalators are operational systemwide. The availability rate is based on physical observations performed the morning of regular business days only. This is a new indicator the agency began reporting in 2009. 2011 12 Service Indicators M U % 1 97.00 97.00
In [47]:
from lxml import objectify

path = 'datasets/mta_perf/Performance_MNR.xml'
parsed = objectify.parse(open(path))
root = parsed.getroot()
In [48]:
data = []

skip_fields = ['PARENT_SEQ', 'INDICATOR_SEQ',
               'DESIRED_CHANGE', 'DECIMAL_PLACES']

for elt in root.INDICATOR:
    el_data = {}
    for child in elt.getchildren():
        if child.tag in skip_fields:
            continue
        el_data[child.tag] = child.pyval
    data.append(el_data)
In [47]:
perf = pd.DataFrame(data)
perf.head()
Out[47]:
AGENCY_NAME INDICATOR_NAME DESCRIPTION PERIOD_YEAR PERIOD_MONTH CATEGORY FREQUENCY INDICATOR_UNIT YTD_TARGET YTD_ACTUAL MONTHLY_TARGET MONTHLY_ACTUAL
0 Metro-North Railroad On-Time Performance (West of Hudson) Percent of commuter trains that arrive at thei... 2008 1 Service Indicators M % 95 96.9 95 96.9
1 Metro-North Railroad On-Time Performance (West of Hudson) Percent of commuter trains that arrive at thei... 2008 2 Service Indicators M % 95 96 95 95
2 Metro-North Railroad On-Time Performance (West of Hudson) Percent of commuter trains that arrive at thei... 2008 3 Service Indicators M % 95 96.3 95 96.9
3 Metro-North Railroad On-Time Performance (West of Hudson) Percent of commuter trains that arrive at thei... 2008 4 Service Indicators M % 95 96.8 95 98.3
4 Metro-North Railroad On-Time Performance (West of Hudson) Percent of commuter trains that arrive at thei... 2008 5 Service Indicators M % 95 96.6 95 95.8
In [49]:
from io import StringIO
tag = '<a href="http://www.google.com">Google</a>'
root = objectify.parse(StringIO(tag)).getroot()
In [50]:
root
root.get('href')
root.text
Out[50]:
'Google'

Binary Data Formats

In [51]:
frame = pd.read_csv('examples/ex1.csv')
frame
frame.to_pickle('examples/frame_pickle')
In [52]:
pd.read_pickle('examples/frame_pickle')
Out[52]:
a b c d message
0 1 2 3 4 hello
1 5 6 7 8 world
2 9 10 11 12 foo
In [57]:
!del examples\frame_pickle

Using HDF5 Format

In [58]:
frame = pd.DataFrame({'a': np.random.randn(100)})
store = pd.HDFStore('mydata.h5')
store['obj1'] = frame
store['obj1_col'] = frame['a']
store
Out[58]:
<class 'pandas.io.pytables.HDFStore'>
File path: mydata.h5
In [59]:
store['obj1']
Out[59]:
a
0 -0.204708
1 0.478943
2 -0.519439
3 -0.555730
4 1.965781
... ...
95 0.795253
96 0.118110
97 -0.748532
98 0.584970
99 0.152677

100 rows × 1 columns

In [60]:
store.put('obj2', frame, format='table')
store.select('obj2', where=['index >= 10 and index <= 15'])
store.close()
In [61]:
frame.to_hdf('mydata.h5', 'obj3', format='table')
pd.read_hdf('mydata.h5', 'obj3', where=['index < 5'])
Out[61]:
a
0 -0.204708
1 0.478943
2 -0.519439
3 -0.555730
4 1.965781
In [63]:
import os
os.remove('mydata.h5')

Reading Microsoft Excel Files

In [54]:
xlsx = pd.ExcelFile('examples/ex1.xlsx')
In [65]:
pd.read_excel(xlsx, 'Sheet1')
Out[65]:
Unnamed: 0 a b c d message
0 0 1 2 3 4 hello
1 1 5 6 7 8 world
2 2 9 10 11 12 foo
In [66]:
frame = pd.read_excel('examples/ex1.xlsx', 'Sheet1')
frame
Out[66]:
Unnamed: 0 a b c d message
0 0 1 2 3 4 hello
1 1 5 6 7 8 world
2 2 9 10 11 12 foo
In [55]:
writer = pd.ExcelWriter('examples/ex2.xlsx')
frame.to_excel(writer, 'Sheet1')
writer.save()
In [68]:
frame.to_excel('examples/ex2.xlsx')
In [69]:
!rm examples/ex2.xlsx
'rm' is not recognized as an internal or external command,
operable program or batch file.

Interacting with Web APIs

In [56]:
import requests
url = 'https://api.github.com/repos/pandas-dev/pandas/issues'
resp = requests.get(url)
resp
Out[56]:
<Response [200]>
In [59]:
data = resp.json()
#print(data)
data[0]['title']
Out[59]:
'BUG: value_counts Int64 zero-size array to reduction'
In [60]:
issues = pd.DataFrame(data, columns=['number', 'title',
                                     'labels', 'state'])
issues
Out[60]:
number title labels state
0 33317 BUG: value_counts Int64 zero-size array to red... [{'id': 76811, 'node_id': 'MDU6TGFiZWw3NjgxMQ=... open
1 33316 CLN: Added static types for `pandas/_libs/redu... [{'id': 1280988427, 'node_id': 'MDU6TGFiZWwxMj... open
2 33315 Remove `-Werror` from setup.py [{'id': 129350, 'node_id': 'MDU6TGFiZWwxMjkzNT... open
3 33314 BUG: clang-9: error: -Wl,-export_dynamic: 'lin... [{'id': 76811, 'node_id': 'MDU6TGFiZWw3NjgxMQ=... open
4 33313 BUG: Bad error message on read_parquet() with ... [{'id': 76811, 'node_id': 'MDU6TGFiZWw3NjgxMQ=... open
... ... ... ... ...
25 33281 ENH: Optimize nrows in read_excel [{'id': 49254273, 'node_id': 'MDU6TGFiZWw0OTI1... open
26 33279 API/CLN: simplify CategoricalBlock.replace [{'id': 78527356, 'node_id': 'MDU6TGFiZWw3ODUy... open
27 33278 ENH: Added schema kwarg to get_schema method [] open
28 33277 CLN 31942/replace appender with doc 3 [] open
29 33276 EA interface - requirements for "hashable, val... [{'id': 35818298, 'node_id': 'MDU6TGFiZWwzNTgx... open

30 rows × 4 columns

Interacting with Databases

In [64]:
import sqlite3
query = """
CREATE TABLE test
(a VARCHAR(20), b VARCHAR(20),
 c REAL,        d INTEGER
);"""
con = sqlite3.connect('mydata.sqlite')
con.execute(query)
con.commit()
---------------------------------------------------------------------------
OperationalError                          Traceback (most recent call last)
<ipython-input-64-7d31cf7a3bea> in <module>
      6 );"""
      7 con = sqlite3.connect('mydata.sqlite')
----> 8 con.execute(query)
      9 con.commit()

OperationalError: table test already exists
In [74]:
data = [('Atlanta', 'Georgia', 1.25, 6),
        ('Tallahassee', 'Florida', 2.6, 3),
        ('Sacramento', 'California', 1.7, 5)]
stmt = "INSERT INTO test VALUES(?, ?, ?, ?)"
con.executemany(stmt, data)
con.commit()
In [75]:
cursor = con.execute('select * from test')
rows = cursor.fetchall()
rows
Out[75]:
[('Atlanta', 'Georgia', 1.25, 6),
 ('Tallahassee', 'Florida', 2.6, 3),
 ('Sacramento', 'California', 1.7, 5)]
In [76]:
cursor.description
pd.DataFrame(rows, columns=[x[0] for x in cursor.description])
Out[76]:
a b c d
0 Atlanta Georgia 1.25 6
1 Tallahassee Florida 2.60 3
2 Sacramento California 1.70 5
In [77]:
import sqlalchemy as sqla
db = sqla.create_engine('sqlite:///mydata.sqlite')
pd.read_sql('select * from test', db)
Out[77]:
a b c d
0 Atlanta Georgia 1.25 6
1 Tallahassee Florida 2.60 3
2 Sacramento California 1.70 5
In [65]:
!del mydata.sqlite
D:\1MyCloudStore\MyGoogleDrive\MyWebSite\spring2020\ap\lectures\Python for Data Analysis-2ndEd-ipynbs\mydata.sqlite
The process cannot access the file because it is being used by another process.

Conclusion