Python for Data Analysis

Data Wrangling with Pandas, NumPy,

and IPython

Wes McKinney

Data Loading, Storage, and File Formats¶

import numpy as np
import pandas as pd
np.random.seed(12345)
import matplotlib.pyplot as plt
plt.rc('figure', figsize=(10, 6))
np.set_printoptions(precision=4, suppress=True)

Reading and Writing Data in Text Format¶

!more examples\ex1.csv

a,b,c,d,message
1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo

df = pd.read_csv('examples/ex1.csv')
df

pd.read_table('examples/ex1.csv', sep=',')

!more examples\ex2.csv

1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo

pd.read_csv('examples/ex2.csv', header=None)
pd.read_csv('examples/ex2.csv', names=['a', 'b', 'c', 'd', 'message'])

names = ['a', 'b', 'c', 'd', 'message']
pd.read_csv('examples/ex2.csv', names=names, index_col='message')

!more examples\csv_mindex.csv
parsed = pd.read_csv('examples/csv_mindex.csv',
                     index_col=['key1', 'key2'])
parsed

key1,key2,value1,value2
one,a,1,2
one,b,3,4
one,c,5,6
one,d,7,8
two,a,9,10
two,b,11,12
two,c,13,14
two,d,15,16

list(open('examples/ex3.txt'))

['            A         B         C\n',
 'aaa -0.264438 -1.026059 -0.619500\n',
 'bbb  0.927272  0.302904 -0.032399\n',
 'ccc -0.264273 -0.386314 -0.217601\n',
 'ddd -0.871858 -0.348382  1.100491\n']

result = pd.read_table('examples/ex3.txt', sep='\s+')
result

!more examples\ex4.csv
pd.read_csv('examples/ex4.csv', skiprows=[0, 2, 3])

# hey!
a,b,c,d,message
# just wanted to make things more difficult for you
# who reads CSV files with computers, anyway?
1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo

!more examples\ex5.csv
result = pd.read_csv('examples/ex5.csv')
result
pd.isnull(result)

something,a,b,c,d,message
one,1,2,3,4,NA
two,5,6,,8,world
three,9,10,11,12,foo

result = pd.read_csv('examples/ex5.csv', na_values=['NULL'])
result

sentinels = {'message': ['foo', 'NA'], 'something': ['two']}
pd.read_csv('examples/ex5.csv', na_values=sentinels)

Reading Text Files in Pieces¶

pd.options.display.max_rows = 10

result = pd.read_csv('examples/ex6.csv')
result

pd.read_csv('examples/ex6.csv', nrows=5)

chunker = pd.read_csv('examples/ex6.csv', chunksize=1000)
chunker

<pandas.io.parsers.TextFileReader at 0x28bbe433588>

chunker = pd.read_csv('examples/ex6.csv', chunksize=1000)

tot = pd.Series([])
for piece in chunker:
    tot = tot.add(piece['key'].value_counts(), fill_value=0)

tot = tot.sort_values(ascending=False)

tot[:10]

E    368.0
X    364.0
L    346.0
O    343.0
Q    340.0
M    338.0
J    337.0
F    335.0
K    334.0
H    330.0
dtype: float64

Writing Data to Text Format¶

data = pd.read_csv('examples/ex5.csv')
data

data.to_csv('examples\out.csv')
!more examples\out.csv

,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,foo

import sys
data.to_csv(sys.stdout, sep='|')

|something|a|b|c|d|message
0|one|1|2|3.0|4|
1|two|5|6||8|world
2|three|9|10|11.0|12|foo

data.to_csv(sys.stdout, na_rep='NULL')

,something,a,b,c,d,message
0,one,1,2,3.0,4,NULL
1,two,5,6,NULL,8,world
2,three,9,10,11.0,12,foo

data.to_csv(sys.stdout, index=False, header=False)

one,1,2,3.0,4,
two,5,6,,8,world
three,9,10,11.0,12,foo

data.to_csv(sys.stdout, index=False, columns=['a', 'b', 'c'])

a,b,c
1,2,3.0
5,6,
9,10,11.0

dates = pd.date_range('1/1/2000', periods=7)
ts = pd.Series(np.arange(7), index=dates)
ts.to_csv('examples/tseries.csv')
!more examples\tseries.csv

,0
2000-01-01,0
2000-01-02,1
2000-01-03,2
2000-01-04,3
2000-01-05,4
2000-01-06,5
2000-01-07,6

Working with Delimited Formats¶

!more examples\ex7.csv

"a","b","c"
"1","2","3"
"1","2","3"

import csv
f = open('examples/ex7.csv')

reader = csv.reader(f)

for line in reader:
    print(line)

['a', 'b', 'c']
['1', '2', '3']
['1', '2', '3']

with open('examples/ex7.csv') as f:
    lines = list(csv.reader(f))

header, values = lines[0], lines[1:]

data_dict = {h: v for h, v in zip(header, zip(*values))}
data_dict

{'a': ('1', '1'), 'b': ('2', '2'), 'c': ('3', '3')}

class my_dialect(csv.Dialect): lineterminator = '\n' delimiter = ';' quotechar = '"' quoting = csv.QUOTE_MINIMAL

reader = csv.reader(f, dialect=my_dialect)

reader = csv.reader(f, delimiter='|')

with open('mydata.csv', 'w') as f: writer = csv.writer(f, dialect=my_dialect) writer.writerow(('one', 'two', 'three')) writer.writerow(('1', '2', '3')) writer.writerow(('4', '5', '6')) writer.writerow(('7', '8', '9'))

JSON Data¶

obj = """
{"name": "Wes",
 "places_lived": ["United States", "Spain", "Germany"],
 "pet": null,
 "siblings": [{"name": "Scott", "age": 30, "pets": ["Zeus", "Zuko"]},
              {"name": "Katie", "age": 38,
               "pets": ["Sixes", "Stache", "Cisco"]}]
}
"""

import json
result = json.loads(obj)
result

{'name': 'Wes',
 'places_lived': ['United States', 'Spain', 'Germany'],
 'pet': None,
 'siblings': [{'name': 'Scott', 'age': 30, 'pets': ['Zeus', 'Zuko']},
  {'name': 'Katie', 'age': 38, 'pets': ['Sixes', 'Stache', 'Cisco']}]}

asjson = json.dumps(result)

siblings = pd.DataFrame(result['siblings'], columns=['name', 'age'])
siblings

!more examples\example.json

[{"a": 1, "b": 2, "c": 3},
 {"a": 4, "b": 5, "c": 6},
 {"a": 7, "b": 8, "c": 9}]

data = pd.read_json('examples/example.json')
data

print(data.to_json())
print(data.to_json(orient='records'))

{"a":{"0":1,"1":4,"2":7},"b":{"0":2,"1":5,"2":8},"c":{"0":3,"1":6,"2":9}}
[{"a":1,"b":2,"c":3},{"a":4,"b":5,"c":6},{"a":7,"b":8,"c":9}]

XML and HTML: Web Scraping¶

conda install lxml pip install beautifulsoup4 html5lib

tables = pd.read_html('examples/fdic_failed_bank_list.html')
len(tables)
failures = tables[0]
failures.head()

close_timestamps = pd.to_datetime(failures['Closing Date'])
close_timestamps.dt.year.value_counts()

2010    157
2009    140
2011     92
2012     51
2008     25
       ... 
2004      4
2001      4
2007      3
2003      3
2000      2
Name: Closing Date, Length: 15, dtype: int64

Parsing XML with lxml.objectify¶

373889 Metro-North Railroad Escalator Availability Percent of the time that escalators are operational systemwide. The availability rate is based on physical observations performed the morning of regular business days only. This is a new indicator the agency began reporting in 2009. 2011 12 Service Indicators M U % 1 97.00 97.00

from lxml import objectify

path = 'datasets/mta_perf/Performance_MNR.xml'
parsed = objectify.parse(open(path))
root = parsed.getroot()

data = []

skip_fields = ['PARENT_SEQ', 'INDICATOR_SEQ',
               'DESIRED_CHANGE', 'DECIMAL_PLACES']

for elt in root.INDICATOR:
    el_data = {}
    for child in elt.getchildren():
        if child.tag in skip_fields:
            continue
        el_data[child.tag] = child.pyval
    data.append(el_data)

perf = pd.DataFrame(data)
perf.head()

from io import StringIO
tag = '<a href="http://www.google.com">Google</a>'
root = objectify.parse(StringIO(tag)).getroot()

root
root.get('href')
root.text

'Google'

Binary Data Formats¶

frame = pd.read_csv('examples/ex1.csv')
frame
frame.to_pickle('examples/frame_pickle')

pd.read_pickle('examples/frame_pickle')

!del examples\frame_pickle

Using HDF5 Format¶

frame = pd.DataFrame({'a': np.random.randn(100)})
store = pd.HDFStore('mydata.h5')
store['obj1'] = frame
store['obj1_col'] = frame['a']
store

<class 'pandas.io.pytables.HDFStore'>
File path: mydata.h5

store['obj1']

store.put('obj2', frame, format='table')
store.select('obj2', where=['index >= 10 and index <= 15'])
store.close()

frame.to_hdf('mydata.h5', 'obj3', format='table')
pd.read_hdf('mydata.h5', 'obj3', where=['index < 5'])

import os
os.remove('mydata.h5')

Reading Microsoft Excel Files¶

xlsx = pd.ExcelFile('examples/ex1.xlsx')

pd.read_excel(xlsx, 'Sheet1')

frame = pd.read_excel('examples/ex1.xlsx', 'Sheet1')
frame

writer = pd.ExcelWriter('examples/ex2.xlsx')
frame.to_excel(writer, 'Sheet1')
writer.save()

frame.to_excel('examples/ex2.xlsx')

!rm examples/ex2.xlsx

'rm' is not recognized as an internal or external command,
operable program or batch file.

Interacting with Web APIs¶

import requests
url = 'https://api.github.com/repos/pandas-dev/pandas/issues'
resp = requests.get(url)
resp

<Response [200]>

data = resp.json()
#print(data)
data[0]['title']

'BUG: value_counts Int64 zero-size array to reduction'

issues = pd.DataFrame(data, columns=['number', 'title',
                                     'labels', 'state'])
issues

Interacting with Databases¶

import sqlite3
query = """
CREATE TABLE test
(a VARCHAR(20), b VARCHAR(20),
 c REAL,        d INTEGER
);"""
con = sqlite3.connect('mydata.sqlite')
con.execute(query)
con.commit()

---------------------------------------------------------------------------
OperationalError                          Traceback (most recent call last)
<ipython-input-64-7d31cf7a3bea> in <module>
      6 );"""
      7 con = sqlite3.connect('mydata.sqlite')
----> 8 con.execute(query)
      9 con.commit()

OperationalError: table test already exists

data = [('Atlanta', 'Georgia', 1.25, 6),
        ('Tallahassee', 'Florida', 2.6, 3),
        ('Sacramento', 'California', 1.7, 5)]
stmt = "INSERT INTO test VALUES(?, ?, ?, ?)"
con.executemany(stmt, data)
con.commit()

cursor = con.execute('select * from test')
rows = cursor.fetchall()
rows

[('Atlanta', 'Georgia', 1.25, 6),
 ('Tallahassee', 'Florida', 2.6, 3),
 ('Sacramento', 'California', 1.7, 5)]

cursor.description
pd.DataFrame(rows, columns=[x[0] for x in cursor.description])

import sqlalchemy as sqla
db = sqla.create_engine('sqlite:///mydata.sqlite')
pd.read_sql('select * from test', db)

!del mydata.sqlite

D:\1MyCloudStore\MyGoogleDrive\MyWebSite\spring2020\ap\lectures\Python for Data Analysis-2ndEd-ipynbs\mydata.sqlite

The process cannot access the file because it is being used by another process.

	one	two	three	four	key
0	0.467976	-0.038649	-0.295344	-1.824726	L
1	-0.358893	1.404453	0.704965	-0.200638	B
2	-0.501840	0.659254	-0.421691	-0.057688	G
3	0.204886	1.074134	1.388361	-0.982404	R
4	0.354628	-0.133116	0.283763	-0.837063	Q
...	...	...	...	...	...
9995	2.311896	-0.417070	-1.409599	-0.515821	L
9996	-0.479893	-0.650419	0.745152	-0.646038	E
9997	0.523331	0.787112	0.486066	1.093156	K
9998	-0.362559	0.598894	-1.843201	0.887292	G
9999	-0.096376	-1.012999	-0.657431	-0.573315	0

	one	two	three	four	key
0	0.467976	-0.038649	-0.295344	-1.824726	L
1	-0.358893	1.404453	0.704965	-0.200638	B
2	-0.501840	0.659254	-0.421691	-0.057688	G
3	0.204886	1.074134	1.388361	-0.982404	R
4	0.354628	-0.133116	0.283763	-0.837063	Q

	Bank Name	City	ST	CERT	Acquiring Institution	Closing Date	Updated Date
0	Allied Bank	Mulberry	AR	91	Today's Bank	September 23, 2016	November 17, 2016
1	The Woodbury Banking Company	Woodbury	GA	11297	United Bank	August 19, 2016	November 17, 2016
2	First CornerStone Bank	King of Prussia	PA	35312	First-Citizens Bank & Trust Company	May 6, 2016	September 6, 2016
3	Trust Company Bank	Memphis	TN	9956	The Bank of Fayette County	April 29, 2016	September 6, 2016
4	North Milwaukee State Bank	Milwaukee	WI	20364	First-Citizens Bank & Trust Company	March 11, 2016	June 16, 2016

	AGENCY_NAME	INDICATOR_NAME	DESCRIPTION	PERIOD_YEAR	PERIOD_MONTH	CATEGORY	FREQUENCY	INDICATOR_UNIT	YTD_TARGET	YTD_ACTUAL	MONTHLY_TARGET	MONTHLY_ACTUAL
0	Metro-North Railroad	On-Time Performance (West of Hudson)	Percent of commuter trains that arrive at thei...	2008	1	Service Indicators	M	%	95	96.9	95	96.9
1	Metro-North Railroad	On-Time Performance (West of Hudson)	Percent of commuter trains that arrive at thei...	2008	2	Service Indicators	M	%	95	96	95	95
2	Metro-North Railroad	On-Time Performance (West of Hudson)	Percent of commuter trains that arrive at thei...	2008	3	Service Indicators	M	%	95	96.3	95	96.9
3	Metro-North Railroad	On-Time Performance (West of Hudson)	Percent of commuter trains that arrive at thei...	2008	4	Service Indicators	M	%	95	96.8	95	98.3
4	Metro-North Railroad	On-Time Performance (West of Hudson)	Percent of commuter trains that arrive at thei...	2008	5	Service Indicators	M	%	95	96.6	95	95.8

	a
0	-0.204708
1	0.478943
2	-0.519439
3	-0.555730
4	1.965781
...	...
95	0.795253
96	0.118110
97	-0.748532
98	0.584970
99	0.152677

	A	B	C
aaa	-0.264438	-1.026059	-0.619500
bbb	0.927272	0.302904	-0.032399
ccc	-0.264273	-0.386314	-0.217601
ddd	-0.871858	-0.348382	1.100491

	something	a	b	c	d	message
0	False	False	False	False	False	True
1	False	False	False	True	False	False
2	False	False	False	False	False	False

	number	title	labels	state
0	33317	BUG: value_counts Int64 zero-size array to red...	[{'id': 76811, 'node_id': 'MDU6TGFiZWw3NjgxMQ=...	open
1	33316	CLN: Added static types for `pandas/_libs/redu...	[{'id': 1280988427, 'node_id': 'MDU6TGFiZWwxMj...	open
2	33315	Remove `-Werror` from setup.py	[{'id': 129350, 'node_id': 'MDU6TGFiZWwxMjkzNT...	open
3	33314	BUG: clang-9: error: -Wl,-export_dynamic: 'lin...	[{'id': 76811, 'node_id': 'MDU6TGFiZWw3NjgxMQ=...	open
4	33313	BUG: Bad error message on read_parquet() with ...	[{'id': 76811, 'node_id': 'MDU6TGFiZWw3NjgxMQ=...	open
...	...	...	...	...
25	33281	ENH: Optimize nrows in read_excel	[{'id': 49254273, 'node_id': 'MDU6TGFiZWw0OTI1...	open
26	33279	API/CLN: simplify CategoricalBlock.replace	[{'id': 78527356, 'node_id': 'MDU6TGFiZWw3ODUy...	open
27	33278	ENH: Added schema kwarg to get_schema method	[]	open
28	33277	CLN 31942/replace appender with doc 3	[]	open
29	33276	EA interface - requirements for "hashable, val...	[{'id': 35818298, 'node_id': 'MDU6TGFiZWwzNTgx...	open

	a	b	c	d
0	Atlanta	Georgia	1.25	6
1	Tallahassee	Florida	2.60	3
2	Sacramento	California	1.70	5

	a	b	c	d	message
0	1	2	3	4	hello
1	5	6	7	8	world
2	9	10	11	12	foo

	a	b	c	d	message
0	1	2	3	4	hello
1	5	6	7	8	world
2	9	10	11	12	foo

	a	b	c	d	message
0	1	2	3	4	hello
1	5	6	7	8	world
2	9	10	11	12	foo

	a	b	c	d
message
hello	1	2	3	4
world	5	6	7	8
foo	9	10	11	12