规整数据:连接、合并、重构、转换etc
三、数据规整-连接-含索引
import pandas as pdimport numpy as np
df_obj1 = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'], 'data1' : ['sfd','fdsf','we',24,3253,234,23]})df_obj2 = pd.DataFrame({'key': ['a', 'b', 'd'], 'data2' : np.random.randint(0,10,3)})print (df_obj1)print (df_obj2)
key data10 b sfd1 b fdsf2 a we3 c 244 a 32535 a 2346 b 23 key data20 a 01 b 62 d 6
merge和on
默认将重叠列的列名作为“外键”进行连接
pd.merge(df_obj1, df_obj2)
|
key |
data1 |
data2 |
| 0 |
b |
sfd |
6 |
| 1 |
b |
fdsf |
6 |
| 2 |
b |
23 |
6 |
| 3 |
a |
we |
0 |
| 4 |
a |
3253 |
0 |
| 5 |
a |
234 |
0 |
# on显示指定“外键”pd.merge(df_obj1, df_obj2, on='key')
|
key |
data1 |
data2 |
| 0 |
b |
sfd |
6 |
| 1 |
b |
fdsf |
6 |
| 2 |
b |
23 |
6 |
| 3 |
a |
we |
0 |
| 4 |
a |
3253 |
0 |
| 5 |
a |
234 |
0 |
left-on和right-on
# left_on,right_on分别指定左侧数据和右侧数据的“外键”# 更改列名df_obj1 = df_obj1.rename(columns={'key':'key1'})df_obj2 = df_obj2.rename(columns={'key':'key2'})
print(df_obj1)print(df_obj2)
key1 data10 b sfd1 b fdsf2 a we3 c 244 a 32535 a 2346 b 23 key2 data20 a 01 b 62 d 6
pd.merge(df_obj1, df_obj2, left_on='key1', right_on='key2')
|
key1 |
data1 |
key2 |
data2 |
| 0 |
b |
sfd |
b |
6 |
| 1 |
b |
fdsf |
b |
6 |
| 2 |
b |
23 |
b |
6 |
| 3 |
a |
we |
a |
0 |
| 4 |
a |
3253 |
a |
0 |
| 5 |
a |
234 |
a |
0 |
how
# “外连接”pd.merge(df_obj1, df_obj2, left_on='key1', right_on='key2', how='outer')
|
key1 |
data1 |
key2 |
data2 |
| 0 |
b |
sfd |
b |
6.0 |
| 1 |
b |
fdsf |
b |
6.0 |
| 2 |
b |
23 |
b |
6.0 |
| 3 |
a |
we |
a |
0.0 |
| 4 |
a |
3253 |
a |
0.0 |
| 5 |
a |
234 |
a |
0.0 |
| 6 |
c |
24 |
NaN |
NaN |
| 7 |
NaN |
NaN |
d |
6.0 |
# 左连接pd.merge(df_obj1, df_obj2, left_on='key1', right_on='key2', how='left')
|
key1 |
data1 |
key2 |
data2 |
| 0 |
b |
sfd |
b |
6.0 |
| 1 |
b |
fdsf |
b |
6.0 |
| 2 |
a |
we |
a |
0.0 |
| 3 |
c |
24 |
NaN |
NaN |
| 4 |
a |
3253 |
a |
0.0 |
| 5 |
a |
234 |
a |
0.0 |
| 6 |
b |
23 |
b |
6.0 |
# 右连接pd.merge(df_obj1, df_obj2, left_on='key1', right_on='key2', how='right')
|
key1 |
data1 |
key2 |
data2 |
| 0 |
b |
sfd |
b |
6 |
| 1 |
b |
fdsf |
b |
6 |
| 2 |
b |
23 |
b |
6 |
| 3 |
a |
we |
a |
0 |
| 4 |
a |
3253 |
a |
0 |
| 5 |
a |
234 |
a |
0 |
| 6 |
NaN |
NaN |
d |
6 |
处理重复列名suffixes
# 处理重复列名df_obj1 = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'], 'data' : np.random.randint(0,10,7)})df_obj2 = pd.DataFrame({'key': ['a', 'b', 'd'], 'data' : np.random.randint(0,10,3)})pd.merge(df_obj1, df_obj2, on='key', suffixes=('_left', '_right'))
|
key |
data_left |
data_right |
| 0 |
b |
9 |
1 |
| 1 |
b |
1 |
1 |
| 2 |
b |
6 |
1 |
| 3 |
a |
7 |
1 |
| 4 |
a |
3 |
1 |
| 5 |
a |
4 |
1 |
# 按索引连接df_obj3 = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'], 'data1' : np.random.randint(0,10,7)})df_obj4 = pd.DataFrame({'data2' : np.random.randint(0,10,3)}, index=['a', 'b', 'd'])
print(df_obj3)print(df_obj4)
key data10 b 71 b 42 a 13 c 94 a 25 a 96 b 7 data2a 9b 4d 0
pd.merge(df_obj3, df_obj4, left_on='key', right_index=True)
|
key |
data1 |
data2 |
| 0 |
b |
7 |
4 |
| 1 |
b |
4 |
4 |
| 6 |
b |
7 |
4 |
| 2 |
a |
1 |
9 |
| 4 |
a |
2 |
9 |
| 5 |
a |
9 |
9 |
按索引连接right_index
# 按索引连接df_obj1 = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'], 'data1' : np.random.randint(0,10,7)})df_obj2 = pd.DataFrame({'data2' : np.random.randint(0,10,3)}, index=['a', 'b', 'd'])
print(df_obj1)print(df_obj2)
key data10 b 01 b 22 a 73 c 34 a 15 a 16 b 6 data2a 2b 1d 1
pd.merge(df_obj1, df_obj2, left_on='key', right_index=True)
|
key |
data1 |
data2 |
| 0 |
b |
0 |
1 |
| 1 |
b |
2 |
1 |
| 6 |
b |
6 |
1 |
| 2 |
a |
7 |
2 |
| 4 |
a |
1 |
2 |
| 5 |
a |
1 |
2 |
四、数据合并
- 数据合并 concat
- 按索引连接===right_index
import numpy as npimport pandas as pd
numpy的concat
arr1 = np.random.randint(0, 10, (3, 4))arr2 = np.random.randint(0, 10, (3, 4))print (arr1)print (arr2)
[[6 0 3 2] [5 7 9 8] [5 8 0 3]][[6 5 7 9] [0 1 0 0] [1 1 1 7]]
np.concatenate([arr1, arr2])
array([[6, 0, 3, 2], [5, 7, 9, 8], [5, 8, 0, 3], [6, 5, 7, 9], [0, 1, 0, 0], [1, 1, 1, 7]])
np.concatenate([arr1, arr2], axis=1)
array([[6, 0, 3, 2, 6, 5, 7, 9], [5, 7, 9, 8, 0, 1, 0, 0], [5, 8, 0, 3, 1, 1, 1, 7]])
series上的concat
# index 没有重复的情况ser_obj1 = pd.Series(np.random.randint(0, 10, 5), index=range(0,5))ser_obj2 = pd.Series(np.random.randint(0, 10, 4), index=range(5,9))ser_obj3 = pd.Series(np.random.randint(0, 10, 3), index=range(9,12))
pd.concat([ser_obj1, ser_obj2, ser_obj3])
0 01 42 53 14 95 76 87 58 09 510 911 0dtype: int32
pd.concat([ser_obj1, ser_obj2, ser_obj3], axis=1)
|
0 |
1 |
2 |
| 0 |
0.0 |
NaN |
NaN |
| 1 |
4.0 |
NaN |
NaN |
| 2 |
5.0 |
NaN |
NaN |
| 3 |
1.0 |
NaN |
NaN |
| 4 |
9.0 |
NaN |
NaN |
| 5 |
NaN |
7.0 |
NaN |
| 6 |
NaN |
8.0 |
NaN |
| 7 |
NaN |
5.0 |
NaN |
| 8 |
NaN |
0.0 |
NaN |
| 9 |
NaN |
NaN |
5.0 |
| 10 |
NaN |
NaN |
9.0 |
| 11 |
NaN |
NaN |
0.0 |
# index 有重复的情况ser_obj1 = pd.Series(np.random.randint(0, 10, 5), index=range(5))ser_obj2 = pd.Series(np.random.randint(0, 10, 4), index=range(4))ser_obj3 = pd.Series(np.random.randint(0, 10, 3), index=range(3))print (ser_obj1)print (ser_obj2)print (ser_obj3)
0 51 32 03 84 3dtype: int320 51 32 23 1dtype: int320 51 82 6dtype: int32
pd.concat([ser_obj1, ser_obj2, ser_obj3])
0 51 32 03 84 30 51 32 23 10 51 82 6dtype: int32
pd.concat([ser_obj1, ser_obj2, ser_obj3], axis=1, join='inner')
|
0 |
1 |
2 |
| 0 |
5 |
5 |
5 |
| 1 |
3 |
3 |
8 |
| 2 |
0 |
2 |
6 |
dataframe上的concat
df_obj1 = pd.DataFrame(np.random.randint(0, 10, (3, 2)), index=['a', 'b', 'c'], columns=['A', 'B'])df_obj2 = pd.DataFrame(np.random.randint(0, 10, (2, 2)), index=['a', 'b'], columns=['C', 'D'])print (df_obj1)print (df_obj2)
A Ba 4 3b 8 1c 6 3 C Da 1 3b 8 2
pd.concat([df_obj1, df_obj2])
C:\Users\wztli\Anaconda3\lib\site-packages\ipykernel_launcher.py:1: FutureWarning: Sorting because non-concatenation axis is not aligned. A future versionof pandas will change to not sort by default.To accept the future behavior, pass 'sort=False'.To retain the current behavior and silence the warning, pass 'sort=True'. """Entry point for launching an IPython kernel.
|
A |
B |
C |
D |
| a |
4.0 |
3.0 |
NaN |
NaN |
| b |
8.0 |
1.0 |
NaN |
NaN |
| c |
6.0 |
3.0 |
NaN |
NaN |
| a |
NaN |
NaN |
1.0 |
3.0 |
| b |
NaN |
NaN |
8.0 |
2.0 |
pd.concat([df_obj1, df_obj2], axis=1)
C:\Users\wztli\Anaconda3\lib\site-packages\ipykernel_launcher.py:1: FutureWarning: Sorting because non-concatenation axis is not aligned. A future versionof pandas will change to not sort by default.To accept the future behavior, pass 'sort=False'.To retain the current behavior and silence the warning, pass 'sort=True'. """Entry point for launching an IPython kernel.
|
A |
B |
C |
D |
| a |
4 |
3 |
1.0 |
3.0 |
| b |
8 |
1 |
8.0 |
2.0 |
| c |
6 |
3 |
NaN |
NaN |
五、数据重构
import numpy as npimport pandas as pd
stack
df_obj = pd.DataFrame(np.random.randint(0,10, (5,2)), columns=['data1', 'data2'])df_obj
|
data1 |
data2 |
| 0 |
0 |
4 |
| 1 |
6 |
2 |
| 2 |
9 |
8 |
| 3 |
7 |
0 |
| 4 |
3 |
1 |
stacked = df_obj.stack()print (stacked)
0 data1 0 data2 41 data1 6 data2 22 data1 9 data2 83 data1 7 data2 04 data1 3 data2 1dtype: int32
print (type(stacked))print (type(stacked.index))
<class 'pandas.core.series.Series'><class 'pandas.core.indexes.multi.MultiIndex'>
unstack
# 默认操作内层索引stacked.unstack()
|
data1 |
data2 |
| 0 |
0 |
4 |
| 1 |
6 |
2 |
| 2 |
9 |
8 |
| 3 |
7 |
0 |
| 4 |
3 |
1 |
# 通过level指定操作索引的级别stacked.unstack(level=0)
|
0 |
1 |
2 |
3 |
4 |
| data1 |
0 |
6 |
9 |
7 |
3 |
| data2 |
4 |
2 |
8 |
0 |
1 |
六、数据转换
import numpy as npimport pandas as pd
重复数据duplicates函数
df_obj = pd.DataFrame({'data1' : ['a'] * 4 + ['b'] * 4, 'data2' : np.random.randint(0, 4, 8)})df_obj
|
data1 |
data2 |
| 0 |
a |
3 |
| 1 |
a |
2 |
| 2 |
a |
2 |
| 3 |
a |
1 |
| 4 |
b |
0 |
| 5 |
b |
2 |
| 6 |
b |
2 |
| 7 |
b |
1 |
df_obj.duplicated()
0 False1 False2 True3 False4 False5 False6 True7 Falsedtype: bool
df_obj.drop_duplicates()
|
data1 |
data2 |
| 0 |
a |
3 |
| 1 |
a |
2 |
| 3 |
a |
1 |
| 4 |
b |
0 |
| 5 |
b |
2 |
| 7 |
b |
1 |
df_obj.drop_duplicates('data2')
|
data1 |
data2 |
| 0 |
a |
3 |
| 1 |
a |
2 |
| 3 |
a |
1 |
| 4 |
b |
0 |
map函数
ser_obj = pd.Series(np.random.randint(0,10,10))ser_obj
0 11 92 13 24 75 26 47 58 49 6dtype: int32
ser_obj.map(lambda x : x ** 2)
0 11 812 13 44 495 46 167 258 169 36dtype: int64
数据替换repalce
# 替换单个值ser_obj.replace(0, -100)
0 11 92 13 24 75 26 47 58 49 6dtype: int32
# 替换多个值ser_obj.replace([0, 2], -100)
0 11 92 13 -1004 75 -1006 47 58 49 6dtype: int32
# 替换多个值ser_obj.replace([0, 2], [-100, -200])
0 11 92 13 -2004 75 -2006 47 58 49 6dtype: int64
ser_obj.map(lambda x : x ** 2)#### 3. 数据替换repalce# 替换单个值ser_obj.replace(0, -100)# 替换多个值ser_obj.replace([0, 2], -100)# 替换多个值ser_obj.replace([0, 2], [-100, -200])
0 11 92 13 -2004 75 -2006 47 58 49 6dtype: int64