一、Pandas数据结构
import pandas as pd
Series
- 通过list构建Series
ser_obj = pd.Series(range(10, 20,2)) print (type(ser_obj)) print(ser_obj)
<class 'pandas.core.series.Series'> 0 10 1 12 2 14 3 16 4 18 dtype: int64
# 获取数据 print (ser_obj.values) # 获取索引 print (ser_obj.index) #范围索引数据类型 # 预览数据 print (ser_obj.head(3)) #默认输出五行
[10 12 14 16 18] RangeIndex(start=0, stop=5, step=1) 0 10 1 12 2 14 dtype: int64
- 通过dict构建Series
year_data = {2001: 17.8, 2002: 20.1, 2003: 16.5,2004:324,2423:243} ser_obj2 = pd.Series(year_data) print (ser_obj2.head(2)) print (ser_obj2.index) print(ser_obj2)
2001 17.8 2002 20.1 dtype: float64 Int64Index([2001, 2002, 2003, 2004, 2423], dtype='int64') 2001 17.8 2002 20.1 2003 16.5 2004 324.0 2423 243.0 dtype: float64
# name属性【【【【【出问题了!!!】】】】】 ser_obj2.name = '钱' ser_obj2.index.name = 'year' print (ser_obj2.head())
year 2001 17.8 2002 20.1 2003 16.5 2004 324.0 2423 243.0 Name: 钱, dtype: float64
DataFrame
- 通过ndarray构建DataFrame
import numpy as np array = np.random.rand(5,4) print (array) df_obj = pd.DataFrame(array,columns=['a','b','c','d']) print (df_obj.head()) print(df_obj.sort_values(by='a', ascending=False))
[[0.23496522 0.92258429 0.36447462 0.52634697] [0.73743514 0.88175941 0.48944212 0.4173522 ] [0.21214568 0.57148666 0.59496072 0.49490723] [0.7458542 0.74743907 0.70475157 0.28130394] [0.43805937 0.90300134 0.00730653 0.68203725]] a b c d 0 0.234965 0.922584 0.364475 0.526347 1 0.737435 0.881759 0.489442 0.417352 2 0.212146 0.571487 0.594961 0.494907 3 0.745854 0.747439 0.704752 0.281304 4 0.438059 0.903001 0.007307 0.682037 a b c d 3 0.745854 0.747439 0.704752 0.281304 1 0.737435 0.881759 0.489442 0.417352 4 0.438059 0.903001 0.007307 0.682037 0 0.234965 0.922584 0.364475 0.526347 2 0.212146 0.571487 0.594961 0.494907
- 通过dict构建DataFrame
#一个键值对就相当于一列!!但是具体到字典里面的值所用到的一些函数还是不能很清楚 dict_data = {'A': 1., 'B': pd.Timestamp('20161217'), 'C': pd.Series(1, index=list(range(4)),dtype='float32'), 'D': np.array([3] * 4,dtype='int32'), 'E' : pd.Categorical(["Python","Java","C++","C#"]), 'F' : 'ChinaHadoop' } df_obj2 = pd.DataFrame(dict_data) print (df_obj2.head())
A B C D E F 0 1.0 2016-12-17 1.0 3 Python ChinaHadoop 1 1.0 2016-12-17 1.0 3 Java ChinaHadoop 2 1.0 2016-12-17 1.0 3 C++ ChinaHadoop 3 1.0 2016-12-17 1.0 3 C# ChinaHadoop
# 增加列 df_obj2['G'] = df_obj2['D'] + 4 print (df_obj2.head()) xxx = pd.DataFrame(df_obj2,columns=['A','B','C','D','E','F','G','H'],index=[0,1,2,3,4]) print(xxx)
A B C D E F G 0 1.0 2016-12-17 1.0 3 Python ChinaHadoop 7 1 1.0 2016-12-17 1.0 3 Java ChinaHadoop 7 2 1.0 2016-12-17 1.0 3 C++ ChinaHadoop 7 3 1.0 2016-12-17 1.0 3 C# ChinaHadoop 7 A B C D E F G H 0 1.0 2016-12-17 1.0 3.0 Python ChinaHadoop 7.0 NaN 1 1.0 2016-12-17 1.0 3.0 Java ChinaHadoop 7.0 NaN 2 1.0 2016-12-17 1.0 3.0 C++ ChinaHadoop 7.0 NaN 3 1.0 2016-12-17 1.0 3.0 C# ChinaHadoop 7.0 NaN 4 NaN NaT NaN NaN NaN NaN NaN NaN
Index
print (type(ser_obj.index)) print (type(df_obj2.index)) print (df_obj2.index)
<class 'pandas.core.indexes.range.RangeIndex'> <class 'pandas.core.indexes.numeric.Int64Index'> Int64Index([0, 1, 2, 3], dtype='int64')
# 索引对象不可变 df_obj2.index[0] = 2
--------------------------------------------------------------------------- TypeError Traceback (most recent call last) <ipython-input-10-6367894e76d8> in <module> 1 # 索引对象不可变 ----> 2 df_obj2.index[0] = 2 ~\Anaconda3\lib\site-packages\pandas\core\indexes\base.py in __setitem__(self, key, value) 4258 4259 def __setitem__(self, key, value): -> 4260 raise TypeError("Index does not support mutable operations") 4261 4262 def __getitem__(self, key): TypeError: Index does not support mutable operations
二、Pandas数据操作
常用函数总结
·shape 获取数据的尺寸
获得df的尺寸(行数, 列数):df.shape 获得df中的行数:df.shape[0] 获得df中的列数:df.shape[1] 获得行索引信息:df.index 获得列索引信息:df.columns
·values 获得df中的值===中文没用
df.values === 以数组(numpy.ndarray)的形式返回数据,去除了行、列索引===把DataFrame类型数据转换成array类型
·set_index和reset_index
reset_index可以还原索引,重新变为默认的整型索引 DataFrame.reset_index(level=None, drop=False, inplace=False, col_level=0, col_fill='') level控制了具体要还原哪个等级的索引 drop为False则索引列会被还原为普通列,否则会丢失 set_index方法,设置单索引和复合索引抑或是添加索引。 DataFrame.set_index(keys, drop=True, append=False, inplace=False, verify_integrity=False) append为True时,在保留原有索引的基础上添加新索引;drop为False时,被设为索引的列仍会保留在数据列中;inplace为True时,直接修改原DataFrame而不返回新对象
·iterrows()遍历DataFrame中的数据
for index,row in df.iterrows():
·str.split(sep,n,expand=False)
sep表示用于分割的字符;n表示最多分割的次数(最多得到n+1列);expand表示是否展开为数据框,True输出DataFrame,False输出Series。 字段拆分:是指按照固定的字符,拆分已有字符串
import pandas as pd import numpy as np
匿名函数应用
# Numpy ufunc 函数 df = pd.DataFrame(np.random.randn(5,4) - 1) print (df) print (np.abs(df))
0 1 2 3 0 0.624016 -2.695175 -1.211426 -0.386151 1 -1.335385 -1.315232 -0.305902 -0.361348 2 -0.349443 -2.032110 0.075995 -0.966725 3 -1.631192 -1.051390 -1.767981 -0.366663 4 -0.786178 -0.335846 -0.797992 -0.931216 0 1 2 3 0 0.624016 2.695175 1.211426 0.386151 1 1.335385 1.315232 0.305902 0.361348 2 0.349443 2.032110 0.075995 0.966725 3 1.631192 1.051390 1.767981 0.366663 4 0.786178 0.335846 0.797992 0.931216
# 使用apply应用行或列数据 f = lambda x : x.max() print (df.apply(f))
0 0.624016 1 -0.335846 2 0.075995 3 -0.361348 dtype: float64
# 指定轴方向 print (df.apply(f, axis=1))
0 0.624016 1 -0.305902 2 0.075995 3 -0.366663 4 -0.335846 dtype: float64
# 使用applymap应用到每个数据 f2 = lambda x : '%.2f' % x print (df.applymap(f2))
0 1 2 3 0 0.62 -2.70 -1.21 -0.39 1 -1.34 -1.32 -0.31 -0.36 2 -0.35 -2.03 0.08 -0.97 3 -1.63 -1.05 -1.77 -0.37 4 -0.79 -0.34 -0.80 -0.93
排序
s4 = pd.Series(range(10, 15), index = np.random.randint(5, size=5)) print (s4)
4 10 1 11 4 12 1 13 1 14 dtype: int64
- 索引排序
s4.sort_index()
1 11 1 13 1 14 4 10 4 12 dtype: int64
df4 = pd.DataFrame(np.random.randn(3, 4), index=np.random.randint(3, size=3), columns=np.random.randint(4, size=4))
df4
2 | 1 | 3 | 1 | |
0 | 0.007031 | 1.261990 | -1.647929 | 0.176549 |
1 | -2.510698 | -0.207659 | 0.628221 | 0.441352 |
0 | -0.367051 | 1.536606 | 0.167158 | -0.236129 |
#df4.sort_index(ascending=False) df4.sort_index(axis=1)
1 | 1 | 2 | 3 | |
0 | 1.261990 | 0.176549 | 0.007031 | -1.647929 |
1 | -0.207659 | 0.441352 | -2.510698 | 0.628221 |
0 | 1.536606 | -0.236129 | -0.367051 | 0.167158 |
- 按值排序
#df.sort_values(by='a', ascending=False) === 通过a的值 # 作用是对选定的一列数值('a')数据从上往下进行排序:ascending=False时从大到小,默认ascending=True时从小到大(如果排序结果没有生效===设置inplace=True覆盖本体,或把返回值重新赋值覆盖) df4.sort_values(by=1)
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) <ipython-input-22-36ffa8ddd07d> in <module> 2 #df.sortvalues(by='a', ascending=False) === 通过a的值 3 # 作用是对选定的一列数值('a')数据从上往下从小到大进行排序(如果传值没成功===设置本体覆盖,传值覆盖) ----> 4 df4.sort_values(by=1) ~\Anaconda3\lib\site-packages\pandas\core\frame.py in sort_values(self, by, axis, ascending, inplace, kind, na_position) 4991 4992 by = by[0] -> 4993 k = self._get_label_or_level_values(by, axis=axis) 4994 4995 if isinstance(ascending, (tuple, list)): ~\Anaconda3\lib\site-packages\pandas\core\generic.py in _get_label_or_level_values(self, key, axis) 1795 key=key, 1796 label_axis_name=label_axis_name, -> 1797 multi_message=multi_message, 1798 ) 1799 ) ValueError: The column label '1' is not unique.
评论区