清洗数据：删除指定数据、处理缺失数据等
一、数据预览:tail()、head()
import numpy as np
import pandas as pd

# Build a 5x4 frame of standard-normal samples with columns a-d.
df_obj = pd.DataFrame(np.random.randn(5, 4), columns=['a', 'b', 'c', 'd'])

# tail() and head() each show at most 5 rows by default, so for this
# 5-row frame both calls print the whole table.
print(df_obj.tail())  # preview the last rows
print(df_obj.head())  # preview the first rows
          a         b         c         d
0 -0.507788  0.213237  0.003150 -0.777312
1 -0.896653 -2.188016 -0.114848  0.167057
2 -1.131242 -0.142287 -1.027330  1.861814
3  0.369608  0.823453  1.030830 -0.041778
4 -0.647625  0.056791 -0.394078 -1.347718
          a         b         c         d
0 -0.507788  0.213237  0.003150 -0.777312
1 -0.896653 -2.188016 -0.114848  0.167057
2 -1.131242 -0.142287 -1.027330  1.861814
3  0.369608  0.823453  1.030830 -0.041778
4 -0.647625  0.056791 -0.394078 -1.347718
二、数据描述:shape、info()
# shape is a (row_count, column_count) tuple; unpack it before formatting.
n_rows, n_cols = df_obj.shape
print('数据集有%i行,%i列' % (n_rows, n_cols))
数据集有5行,4列
# info() writes its summary straight to stdout and returns None, so the
# print() below emits an extra "None" line after the summary.
info_result = df_obj.info()
print(info_result)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 4 columns):
a    5 non-null float64
b    5 non-null float64
c    5 non-null float64
d    5 non-null float64
dtypes: float64(4)
memory usage: 288.0 bytes
None
三、数据统计:describe()
# Per-column summary statistics: count, mean, std, min, quartiles and max.
summary_stats = df_obj.describe()
print(summary_stats)
              a         b         c         d
count  5.000000  5.000000  5.000000  5.000000
mean  -0.562740 -0.247365 -0.100455 -0.027587
std    0.573191  1.143294  0.747673  1.215808
min   -1.131242 -2.188016 -1.027330 -1.347718
25%   -0.896653 -0.142287 -0.394078 -0.777312
50%   -0.647625  0.056791 -0.114848 -0.041778
75%   -0.507788  0.213237  0.003150  0.167057
max    0.369608  0.823453  1.030830  1.861814
四、pandas不完全显示行列
# Widen pandas' display limits so printed frames are not truncated.
pd.set_option('display.max_rows', 100)  # max rows shown (avoid showing only part of the rows)
pd.set_option('display.max_columns', 1000)  # max columns shown (avoid hidden columns)
pd.set_option('display.max_colwidth', 1000)  # max width per column (avoid cut-off values or column names)
pd.set_option('display.width', 1000)  # total line width (avoid wrapping a row across lines)
五、删除指定行列数据
import numpy as np
import pandas as pd

# Mixed-type columns: the scalar values ('A', 'B', 'F') are repeated for
# every row, while 'C', 'D' and 'E' each supply four values.
dict_data = {
    'A': 1.,
    'B': pd.Timestamp('20161217'),
    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
    'D': np.array([3] * 4, dtype='int32'),
    'E': pd.Categorical(["Python", "Java", "C++", "C#"]),
    'F': 'ChinaHadoop',
}
df_obj2 = pd.DataFrame(dict_data)
print(df_obj2)
     A          B    C  D       E            F
0  1.0 2016-12-17  1.0  3  Python  ChinaHadoop
1  1.0 2016-12-17  1.0  3    Java  ChinaHadoop
2  1.0 2016-12-17  1.0  3     C++  ChinaHadoop
3  1.0 2016-12-17  1.0  3      C#  ChinaHadoop
del
删除列
# `del` removes column 'A' from df_obj2 in place (no copy is returned).
del df_obj2['A']
print(df_obj2.head())
           B    C  D       E            F
0 2016-12-17  1.0  3  Python  ChinaHadoop
1 2016-12-17  1.0  3    Java  ChinaHadoop
2 2016-12-17  1.0  3     C++  ChinaHadoop
3 2016-12-17  1.0  3      C#  ChinaHadoop
drop
删除行/列数据
dict_data = {
    'A': 1.,
    'B': pd.Timestamp('20161217'),
    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
    'D': np.array([3] * 4, dtype='int32'),
    'E': pd.Categorical(["Python", "Java", "C++", "C#"]),
    'F': 'ChinaHadoop',
}
# Column 'C' prints as NaN throughout: its Series is labelled 0..3, and none
# of those labels match the string row labels given here.
df_obj3 = pd.DataFrame(dict_data, index=['sfd', 'sdfd', 'wer', 'rwer'])
print(df_obj3.head(7))

# drop() returns a new frame; df_obj3 itself keeps all rows and columns.
print(df_obj3.drop('wer'))  # drop a row by label
print(df_obj3.drop('F', axis=1))  # drop a column by label
        A          B   C  D       E            F
sfd   1.0 2016-12-17 NaN  3  Python  ChinaHadoop
sdfd  1.0 2016-12-17 NaN  3    Java  ChinaHadoop
wer   1.0 2016-12-17 NaN  3     C++  ChinaHadoop
rwer  1.0 2016-12-17 NaN  3      C#  ChinaHadoop
        A          B   C  D       E            F
sfd   1.0 2016-12-17 NaN  3  Python  ChinaHadoop
sdfd  1.0 2016-12-17 NaN  3    Java  ChinaHadoop
rwer  1.0 2016-12-17 NaN  3      C#  ChinaHadoop
        A          B   C  D       E
sfd   1.0 2016-12-17 NaN  3  Python
sdfd  1.0 2016-12-17 NaN  3    Java
wer   1.0 2016-12-17 NaN  3     C++
rwer  1.0 2016-12-17 NaN  3      C#
六、处理缺失数据
# Four rows by three columns: one fully populated random row followed by
# three rows that contain NaN gaps.
df_data = pd.DataFrame([
    np.random.randn(3),
    [1., np.nan, np.nan],
    [4., np.nan, np.nan],
    [1., np.nan, 2.],
])
df_data.head()
| | 0 | 1 | 2 |
|---|---|---|---|
| 0 | -0.702713 | -0.991383 | -1.058464 |
| 1 | 1.000000 | NaN | NaN |
| 2 | 4.000000 | NaN | NaN |
| 3 | 1.000000 | NaN | 2.000000 |
判断是否存在缺失值
# Boolean mask with the same shape as df_data: True where a cell is missing.
# Left as a bare expression so a notebook renders the result.
df_data.isnull()
| | 0 | 1 | 2 |
|---|---|---|---|
| 0 | False | False | False |
| 1 | False | True | True |
| 2 | False | True | True |
| 3 | False | True | False |
丢弃缺失数据
# axis=0 drops every row that contains a NaN; axis=1 would drop columns instead.
rows_without_nan = df_data.dropna(axis=0)
print(rows_without_nan)
          0         1         2
0 -0.702713 -0.991383 -1.058464
填充缺失数据
# Replace every missing value with -100.0. The result is not assigned to a
# name, so it is only displayed (e.g. as a notebook cell's output).
df_data.fillna(-100.)
| | 0 | 1 | 2 |
|---|---|---|---|
| 0 | -0.702713 | -0.991383 | -1.058464 |
| 1 | 1.000000 | -100.000000 | -100.000000 |
| 2 | 4.000000 | -100.000000 | -100.000000 |
| 3 | 1.000000 | -100.000000 | 2.000000 |
