规整数据:连接、合并、重构、转换etc
三、数据规整-连接-含索引
- 数据连接 merge
import pandas as pd import numpy as np
df_obj1 = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'], 'data1' : ['sfd','fdsf','we',24,3253,234,23]}) df_obj2 = pd.DataFrame({'key': ['a', 'b', 'd'], 'data2' : np.random.randint(0,10,3)}) print (df_obj1) print (df_obj2)
key data1 0 b sfd 1 b fdsf 2 a we 3 c 24 4 a 3253 5 a 234 6 b 23 key data2 0 a 0 1 b 6 2 d 6
merge和on
默认将重叠列的列名作为“外键”进行连接
pd.merge(df_obj1, df_obj2)
key | data1 | data2 | |
0 | b | sfd | 6 |
1 | b | fdsf | 6 |
2 | b | 23 | 6 |
3 | a | we | 0 |
4 | a | 3253 | 0 |
5 | a | 234 | 0 |
# on显示指定“外键” pd.merge(df_obj1, df_obj2, on='key')
key | data1 | data2 | |
0 | b | sfd | 6 |
1 | b | fdsf | 6 |
2 | b | 23 | 6 |
3 | a | we | 0 |
4 | a | 3253 | 0 |
5 | a | 234 | 0 |
left-on和right-on
# left_on,right_on分别指定左侧数据和右侧数据的“外键” # 更改列名 df_obj1 = df_obj1.rename(columns={'key':'key1'}) df_obj2 = df_obj2.rename(columns={'key':'key2'})
print(df_obj1) print(df_obj2)
key1 data1 0 b sfd 1 b fdsf 2 a we 3 c 24 4 a 3253 5 a 234 6 b 23 key2 data2 0 a 0 1 b 6 2 d 6
pd.merge(df_obj1, df_obj2, left_on='key1', right_on='key2')
key1 | data1 | key2 | data2 | |
0 | b | sfd | b | 6 |
1 | b | fdsf | b | 6 |
2 | b | 23 | b | 6 |
3 | a | we | a | 0 |
4 | a | 3253 | a | 0 |
5 | a | 234 | a | 0 |
how
# “外连接” pd.merge(df_obj1, df_obj2, left_on='key1', right_on='key2', how='outer')
key1 | data1 | key2 | data2 | |
0 | b | sfd | b | 6.0 |
1 | b | fdsf | b | 6.0 |
2 | b | 23 | b | 6.0 |
3 | a | we | a | 0.0 |
4 | a | 3253 | a | 0.0 |
5 | a | 234 | a | 0.0 |
6 | c | 24 | NaN | NaN |
7 | NaN | NaN | d | 6.0 |
# 左连接 pd.merge(df_obj1, df_obj2, left_on='key1', right_on='key2', how='left')
key1 | data1 | key2 | data2 | |
0 | b | sfd | b | 6.0 |
1 | b | fdsf | b | 6.0 |
2 | a | we | a | 0.0 |
3 | c | 24 | NaN | NaN |
4 | a | 3253 | a | 0.0 |
5 | a | 234 | a | 0.0 |
6 | b | 23 | b | 6.0 |
# 右连接 pd.merge(df_obj1, df_obj2, left_on='key1', right_on='key2', how='right')
key1 | data1 | key2 | data2 | |
0 | b | sfd | b | 6 |
1 | b | fdsf | b | 6 |
2 | b | 23 | b | 6 |
3 | a | we | a | 0 |
4 | a | 3253 | a | 0 |
5 | a | 234 | a | 0 |
6 | NaN | NaN | d | 6 |
处理重复列名suffixes
# 处理重复列名 df_obj1 = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'], 'data' : np.random.randint(0,10,7)}) df_obj2 = pd.DataFrame({'key': ['a', 'b', 'd'], 'data' : np.random.randint(0,10,3)}) pd.merge(df_obj1, df_obj2, on='key', suffixes=('_left', '_right'))
key | data_left | data_right | |
0 | b | 9 | 1 |
1 | b | 1 | 1 |
2 | b | 6 | 1 |
3 | a | 7 | 1 |
4 | a | 3 | 1 |
5 | a | 4 | 1 |
# 按索引连接 df_obj3 = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'], 'data1' : np.random.randint(0,10,7)}) df_obj4 = pd.DataFrame({'data2' : np.random.randint(0,10,3)}, index=['a', 'b', 'd'])
print(df_obj3) print(df_obj4)
key data1 0 b 7 1 b 4 2 a 1 3 c 9 4 a 2 5 a 9 6 b 7 data2 a 9 b 4 d 0
pd.merge(df_obj3, df_obj4, left_on='key', right_index=True)
key | data1 | data2 | |
0 | b | 7 | 4 |
1 | b | 4 | 4 |
6 | b | 7 | 4 |
2 | a | 1 | 9 |
4 | a | 2 | 9 |
5 | a | 9 | 9 |
按索引连接right_index
# 按索引连接 df_obj1 = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'], 'data1' : np.random.randint(0,10,7)}) df_obj2 = pd.DataFrame({'data2' : np.random.randint(0,10,3)}, index=['a', 'b', 'd'])
print(df_obj1) print(df_obj2)
key data1 0 b 0 1 b 2 2 a 7 3 c 3 4 a 1 5 a 1 6 b 6 data2 a 2 b 1 d 1
pd.merge(df_obj1, df_obj2, left_on='key', right_index=True)
key | data1 | data2 | |
0 | b | 0 | 1 |
1 | b | 2 | 1 |
6 | b | 6 | 1 |
2 | a | 7 | 2 |
4 | a | 1 | 2 |
5 | a | 1 | 2 |
四、数据合并
- 数据合并 concat
- 按索引连接===right_index
import numpy as np import pandas as pd
numpy的concat
arr1 = np.random.randint(0, 10, (3, 4)) arr2 = np.random.randint(0, 10, (3, 4)) print (arr1) print (arr2)
[[6 0 3 2] [5 7 9 8] [5 8 0 3]] [[6 5 7 9] [0 1 0 0] [1 1 1 7]]
np.concatenate([arr1, arr2])
array([[6, 0, 3, 2], [5, 7, 9, 8], [5, 8, 0, 3], [6, 5, 7, 9], [0, 1, 0, 0], [1, 1, 1, 7]])
np.concatenate([arr1, arr2], axis=1)
array([[6, 0, 3, 2, 6, 5, 7, 9], [5, 7, 9, 8, 0, 1, 0, 0], [5, 8, 0, 3, 1, 1, 1, 7]])
series上的concat
# index 没有重复的情况 ser_obj1 = pd.Series(np.random.randint(0, 10, 5), index=range(0,5)) ser_obj2 = pd.Series(np.random.randint(0, 10, 4), index=range(5,9)) ser_obj3 = pd.Series(np.random.randint(0, 10, 3), index=range(9,12))
pd.concat([ser_obj1, ser_obj2, ser_obj3])
0 0 1 4 2 5 3 1 4 9 5 7 6 8 7 5 8 0 9 5 10 9 11 0 dtype: int32
pd.concat([ser_obj1, ser_obj2, ser_obj3], axis=1)
0 | 1 | 2 | |
0 | 0.0 | NaN | NaN |
1 | 4.0 | NaN | NaN |
2 | 5.0 | NaN | NaN |
3 | 1.0 | NaN | NaN |
4 | 9.0 | NaN | NaN |
5 | NaN | 7.0 | NaN |
6 | NaN | 8.0 | NaN |
7 | NaN | 5.0 | NaN |
8 | NaN | 0.0 | NaN |
9 | NaN | NaN | 5.0 |
10 | NaN | NaN | 9.0 |
11 | NaN | NaN | 0.0 |
# index 有重复的情况 ser_obj1 = pd.Series(np.random.randint(0, 10, 5), index=range(5)) ser_obj2 = pd.Series(np.random.randint(0, 10, 4), index=range(4)) ser_obj3 = pd.Series(np.random.randint(0, 10, 3), index=range(3)) print (ser_obj1) print (ser_obj2) print (ser_obj3)
0 5 1 3 2 0 3 8 4 3 dtype: int32 0 5 1 3 2 2 3 1 dtype: int32 0 5 1 8 2 6 dtype: int32
pd.concat([ser_obj1, ser_obj2, ser_obj3])
0 5 1 3 2 0 3 8 4 3 0 5 1 3 2 2 3 1 0 5 1 8 2 6 dtype: int32
pd.concat([ser_obj1, ser_obj2, ser_obj3], axis=1, join='inner')
0 | 1 | 2 | |
0 | 5 | 5 | 5 |
1 | 3 | 3 | 8 |
2 | 0 | 2 | 6 |
dataframe上的concat
df_obj1 = pd.DataFrame(np.random.randint(0, 10, (3, 2)), index=['a', 'b', 'c'], columns=['A', 'B']) df_obj2 = pd.DataFrame(np.random.randint(0, 10, (2, 2)), index=['a', 'b'], columns=['C', 'D']) print (df_obj1) print (df_obj2)
A B a 4 3 b 8 1 c 6 3 C D a 1 3 b 8 2
pd.concat([df_obj1, df_obj2])
C:\Users\wztli\Anaconda3\lib\site-packages\ipykernel_launcher.py:1: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version of pandas will change to not sort by default. To accept the future behavior, pass 'sort=False'. To retain the current behavior and silence the warning, pass 'sort=True'. """Entry point for launching an IPython kernel.
A | B | C | D | |
a | 4.0 | 3.0 | NaN | NaN |
b | 8.0 | 1.0 | NaN | NaN |
c | 6.0 | 3.0 | NaN | NaN |
a | NaN | NaN | 1.0 | 3.0 |
b | NaN | NaN | 8.0 | 2.0 |
pd.concat([df_obj1, df_obj2], axis=1)
C:\Users\wztli\Anaconda3\lib\site-packages\ipykernel_launcher.py:1: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version of pandas will change to not sort by default. To accept the future behavior, pass 'sort=False'. To retain the current behavior and silence the warning, pass 'sort=True'. """Entry point for launching an IPython kernel.
A | B | C | D | |
a | 4 | 3 | 1.0 | 3.0 |
b | 8 | 1 | 8.0 | 2.0 |
c | 6 | 3 | NaN | NaN |
五、数据重构
import numpy as np import pandas as pd
stack
df_obj = pd.DataFrame(np.random.randint(0,10, (5,2)), columns=['data1', 'data2']) df_obj
data1 | data2 | |
0 | 0 | 4 |
1 | 6 | 2 |
2 | 9 | 8 |
3 | 7 | 0 |
4 | 3 | 1 |
stacked = df_obj.stack() print (stacked)
0 data1 0 data2 4 1 data1 6 data2 2 2 data1 9 data2 8 3 data1 7 data2 0 4 data1 3 data2 1 dtype: int32
print (type(stacked)) print (type(stacked.index))
<class 'pandas.core.series.Series'> <class 'pandas.core.indexes.multi.MultiIndex'>
unstack
# 默认操作内层索引 stacked.unstack()
data1 | data2 | |
0 | 0 | 4 |
1 | 6 | 2 |
2 | 9 | 8 |
3 | 7 | 0 |
4 | 3 | 1 |
# 通过level指定操作索引的级别 stacked.unstack(level=0)
0 | 1 | 2 | 3 | 4 | |
data1 | 0 | 6 | 9 | 7 | 3 |
data2 | 4 | 2 | 8 | 0 | 1 |
六、数据转换
import numpy as np import pandas as pd
重复数据duplicates函数
df_obj = pd.DataFrame({'data1' : ['a'] * 4 + ['b'] * 4, 'data2' : np.random.randint(0, 4, 8)}) df_obj
data1 | data2 | |
0 | a | 3 |
1 | a | 2 |
2 | a | 2 |
3 | a | 1 |
4 | b | 0 |
5 | b | 2 |
6 | b | 2 |
7 | b | 1 |
df_obj.duplicated()
0 False 1 False 2 True 3 False 4 False 5 False 6 True 7 False dtype: bool
df_obj.drop_duplicates()
data1 | data2 | |
0 | a | 3 |
1 | a | 2 |
3 | a | 1 |
4 | b | 0 |
5 | b | 2 |
7 | b | 1 |
df_obj.drop_duplicates('data2')
data1 | data2 | |
0 | a | 3 |
1 | a | 2 |
3 | a | 1 |
4 | b | 0 |
map函数
ser_obj = pd.Series(np.random.randint(0,10,10)) ser_obj
0 1 1 9 2 1 3 2 4 7 5 2 6 4 7 5 8 4 9 6 dtype: int32
ser_obj.map(lambda x : x ** 2)
0 1 1 81 2 1 3 4 4 49 5 4 6 16 7 25 8 16 9 36 dtype: int64
数据替换repalce
# 替换单个值 ser_obj.replace(0, -100)
0 1 1 9 2 1 3 2 4 7 5 2 6 4 7 5 8 4 9 6 dtype: int32
# 替换多个值 ser_obj.replace([0, 2], -100)
0 1 1 9 2 1 3 -100 4 7 5 -100 6 4 7 5 8 4 9 6 dtype: int32
# 替换多个值 ser_obj.replace([0, 2], [-100, -200])
0 1 1 9 2 1 3 -200 4 7 5 -200 6 4 7 5 8 4 9 6 dtype: int64
ser_obj.map(lambda x : x ** 2) #### 3. 数据替换repalce # 替换单个值 ser_obj.replace(0, -100) # 替换多个值 ser_obj.replace([0, 2], -100) # 替换多个值 ser_obj.replace([0, 2], [-100, -200])
0 1 1 9 2 1 3 -200 4 7 5 -200 6 4 7 5 8 4 9 6 dtype: int64
评论区