Data Analysis with Python (Useful Tips)

复制与引用

1
2
3
4
5
6
7
8
9
10
# Plain assignment (a = b) never copies: it binds a second name to the same object.
# For immutable objects (numbers, strings) this behaves like having a copy,
# because the shared object can never be changed in place.
# For mutable objects (list, dict) both names see every in-place change -- dangerous.

# Shallow copy
a = [1, 2, 3, 4]
b = a[:]  # new outer list, but the elements themselves are still shared -- dangerous if they are mutable

# Deep copy
import copy
b = copy.deepcopy(a)
1
2
3
4
# Convert the comma-separated text fields of one record to their proper types
# by pairing each raw string with the matching converter.
line = "GOOD,100,490.10"
types = [str, int, float]
raw_fields = line.split(',')
# Fixed: the original zipped over the undefined name `raw_field` (NameError).
fields = [convert(text) for convert, text in zip(types, raw_fields)]

collections

1
2
3
4
5
from collections import defaultdict
# defaultdict(int) calls int() for a missing key, so every new key starts at 0.
counts = defaultdict(int)

from collections import Counter
# Counter takes an iterable of hashable items and tallies how often each occurs.
# Fixed: the original passed the builtin type `list` itself, which is not
# iterable and raises TypeError; pass an actual list instead.
items = ['a', 'b', 'a', 'c', 'a', 'b']
counts = Counter(items)  # frequency count of the list's elements

pandas

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
from pandas import DataFrame, Series
import pandas as pd
# `records` must already exist before this runs.
# TODO confirm: presumably a list of dicts, some carrying a string under key 'a'.
frame = DataFrame(records)
# First whitespace-separated token of every non-missing value in column 'a'.
results = Series([text.split()[0] for text in frame.a.dropna()])
# First eight rows of the per-token frequency table.
results.value_counts().iloc[:8]

import pandas as pd
# Column names to assign; header=None tells pandas the file has no header row.
unames = ['user_id', 'gender', 'age', 'occupation', 'zip']
# A multi-character separator such as '::' is not supported by the default C
# parser; request the python engine explicitly so pandas does not have to fall
# back to it with a ParserWarning.
users = pd.read_table('ml-1m/users.dat', sep='::', header=None,
                      names=unames, engine='python')

# Chain two merges: ratings with users first, then that result with movies.
# No `on=` is given, so pandas joins on the columns the two inputs share.
data = pd.merge(
    left=pd.merge(left=ratings, right=users),
    right=movies,
)

# Column names are supplied explicitly via names=.
names1880 = pd.read_csv(
    'names/yob1880.txt',
    names=['name', 'sex', 'births'],
)
# Sum of the births column within each sex group.
names1880.groupby('sex')['births'].sum()

numpy

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
import numpy as np
# A flat list becomes a one-dimensional array.
data1 = [6, 7.5, 8, 0, 1]
arr1 = np.asarray(data1)

# A list of equal-length lists becomes a two-dimensional array.
data2 = [[1, 2, 3, 4], [5, 6, 7, 8]]
arr2 = np.asarray(data2)
arr2.ndim   # number of dimensions
arr2.shape  # size along each dimension
arr2.dtype  # element type inferred from the data


# Array constructors
np.zeros(shape=10)
np.zeros(shape=(2, 3))
np.arange(15)

### Linear Algebra
# Matrix product of a 2x3 array with a 3x2 array.
x = np.array([[1., 2., 3.],
              [4., 5., 6.]])
y = np.array([[6., 23.],
              [-1, 7],
              [8, 9]])
np.dot(x, y)  # same result as x.dot(y)

from numpy.linalg import inv, qr
# Fixed: `randn` was used without being imported (NameError).
from numpy.random import randn

X = randn(5, 5)    # 5x5 matrix of standard-normal samples
mat = X.T.dot(X)   # X^T X, a symmetric 5x5 matrix

inv(mat)           # matrix inverse
q, r = qr(mat)     # QR factorization: q.dot(r) reproduces mat

pandas

1
2
3
4
5
6
7
8
9
10
11
12
13
import pandas as pd
#### Series
# Without an explicit index the values are labelled 0..n-1.
pd.Series([4, 7, -5, 3])
# An explicit index attaches a label to each value.
obj2 = pd.Series([4, 7, -5, 3], index=['d', 'b', 'a', 'c'])
# A dict can be passed directly: its keys become the index.
# Fixed: `adict` was never defined (NameError); define a sample dict here.
adict = {'d': 4, 'b': 7, 'a': -5, 'c': 3}
obj2 = pd.Series(adict)

#### DataFrame
# A dict of equal-length lists: each key becomes a column.
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
        'year': [2000, 2001, 2002, 2001, 2002],
        'pop': [1.5, 1.7, 3.6, 2.4, 2.9]}
frame = pd.DataFrame(data)

# `columns` fixes the column order; a requested column that is absent from
# `data` ('debt') is created and filled with NaN. `index` labels the rows.
# Uses pd.DataFrame, matching the `import pandas as pd` of this section,
# instead of a bare `DataFrame` name that this section never imports.
frame2 = pd.DataFrame(data, columns=['year', 'state', 'pop', 'debt'],
                      index=['one', 'two', 'three', 'four', 'five'])