pandas学习2

数据预处理案例 -泰坦尼克号获救预测

import pandas as pd
import numpy as np
# Load the Titanic training data; the path is relative to the working directory.
titanic_survival = pd.read_csv("titanic_train.csv")
# Preview the first rows (the result is only displayed in an interactive session).
titanic_survival.head()


image.png

# pandas marks a missing value as NaN ("not a number").
# Series.isnull() returns a boolean Series that is True wherever a value is missing.
age = titanic_survival["Age"]  # select the "Age" column
age_is_null = age.isnull()  # boolean mask (a Series, not a list) of missing ages
age_null_true = age.loc[age_is_null]  # boolean indexing keeps only the missing entries
age_null_count = age_null_true.shape[0]  # how many ages are missing
print(age_null_count)
177
# Any arithmetic that involves NaN yields NaN, so a plain sum()/len() over a
# column with missing values produces nan instead of the mean.
mean_age = sum(titanic_survival["Age"]) / len(titanic_survival["Age"])
print(mean_age)
nan
# Missing values must be filtered out before computing the mean by hand.
# ~age_is_null inverts the boolean mask, keeping only the non-missing ages.
good_ages = titanic_survival["Age"][~age_is_null]
correct_mean_age = sum(good_ages) / len(good_ages)
print(correct_mean_age)
29.6991176471
# Missing data is so common that many pandas methods skip it automatically:
# Series.mean() ignores NaN values by default.
correct_mean_age = titanic_survival["Age"].mean()
print(correct_mean_age)
29.6991176471
# Mean fare for each passenger class, computed with an explicit loop.
passenger_classes = [1, 2, 3]
fares_by_class = {}
for this_class in passenger_classes:
    # Keep only the rows belonging to the current class.
    pclass_rows = titanic_survival[titanic_survival["Pclass"] == this_class]
    pclass_fares = pclass_rows["Fare"]
    fare_for_class = pclass_fares.mean()
    fares_by_class[this_class] = fare_for_class
print(fares_by_class)
{1: 84.154687499999994, 2: 20.662183152173913, 3: 13.675550101832993}
# pivot_table() summarises one column grouped by another:
#   index   - the column to group by
#   values  - the column the calculation is applied to
#   aggfunc - the calculation to perform; "mean" averages each group
# The string name is used instead of np.mean because recent pandas versions
# deprecate passing NumPy callables as aggfunc.
passenger_survival = titanic_survival.pivot_table(index="Pclass", values="Survived", aggfunc="mean")
print(passenger_survival)
Pclass
1    0.629630
2    0.472826
3    0.242363
Name: Survived, dtype: float64
# Average age per passenger class. "mean" is pivot_table's default aggregation;
# it is spelled out here only for readability.
passenger_age = titanic_survival.pivot_table(
    values="Age",
    index="Pclass",
    aggfunc="mean",
)
print(passenger_age)
Pclass
1    38.233441
2    29.877630
3    25.140620
Name: Age, dtype: float64
# Per embarkation port ("Embarked"): total fare collected and number of survivors.
# Several value columns can be aggregated at once by passing a list.
# The string "sum" is used instead of np.sum because recent pandas versions
# deprecate passing NumPy callables as aggfunc.
port_stats = titanic_survival.pivot_table(index="Embarked", values=["Fare", "Survived"], aggfunc="sum")
print(port_stats)
                Fare  Survived
Embarked                      
C         10072.2962        93
Q          1022.2543        30
S         17439.3988       217
# axis=1 (or axis='columns') drops every column that contains a null value
drop_na_columns = titanic_survival.dropna(axis=1)   # drop by column
# axis=0 with subset: drop each row whose "Age" or "Sex" value is null
new_titanic_survival = titanic_survival.dropna(axis=0,subset=["Age", "Sex"])
# Quick reference for missing-value handling on a generic DataFrame `df`.
# NOTE: `df` is not defined in this file; substitute your own DataFrame.
# For dropna: axis=0 drops rows (the default), axis=1 drops columns.
df[df.isnull()]  # df.isnull() is a boolean mask; cells where the mask is False become NaN
df[df.notnull()]  # same masking with the inverse mask: non-null cells are kept
df.dropna()  # drop every row that contains at least one NaN
df.dropna(axis=1, thresh=3)  # keep only the columns with at least 3 non-NaN values
df.dropna(how='all')  # drop only the rows in which every value is NaN
df.fillna(0)  # replace every NaN with 0
df.fillna({1: 0, 2: 0.5})  # per-column fill: column labelled 1 -> 0, column labelled 2 -> 0.5
df.ffill()  # forward fill: each NaN takes the previous valid value in its column
# .loc[row_label, column_label] picks out a single value.
row_index_83_age = titanic_survival.loc[83, "Age"]
# NOTE: despite the variable name, this reads the row labelled 766.
row_index_1000_pclass = titanic_survival.loc[766, "Pclass"]
print(row_index_83_age)
print(row_index_1000_pclass)
28.0
1
# Sort by age, oldest first. Each row keeps its original index label.
new_titanic_survival = titanic_survival.sort_values("Age", ascending=False)
# Slicing takes the first ten rows by position; the printed index still
# shows the labels from before the sort.
print(new_titanic_survival[0:10])
# reset_index(drop=True) discards the old labels and renumbers the rows
# from 0 in the new (age-sorted) order.
titanic_reindexed = new_titanic_survival.reset_index(drop=True)
print(titanic_reindexed.iloc[0:10])  # positional slice of the renumbered frame
     PassengerId  Survived  Pclass                                  Name  \
630          631         1       1  Barkworth, Mr. Algernon Henry Wilson   
851          852         0       3                   Svensson, Mr. Johan   
493          494         0       1               Artagaveytia, Mr. Ramon   
96            97         0       1             Goldschmidt, Mr. George B   
116          117         0       3                  Connors, Mr. Patrick   
672          673         0       2           Mitchell, Mr. Henry Michael   
745          746         0       1          Crosby, Capt. Edward Gifford   
33            34         0       2                 Wheadon, Mr. Edward H   
54            55         0       1        Ostby, Mr. Engelhart Cornelius   
280          281         0       3                      Duane, Mr. Frank   
      Sex   Age  SibSp  Parch      Ticket     Fare Cabin Embarked age_labels  
630  male  80.0      0      0       27042  30.0000   A23        S      adult  
851  male  74.0      0      0      347060   7.7750   NaN        S      adult  
493  male  71.0      0      0    PC 17609  49.5042   NaN        C      adult  
96   male  71.0      0      0    PC 17754  34.6542    A5        C      adult  
116  male  70.5      0      0      370369   7.7500   NaN        Q      adult  
672  male  70.0      0      0  C.A. 24580  10.5000   NaN        S      adult  
745  male  70.0      1      1   WE/P 5735  71.0000   B22        S      adult  
33   male  66.0      0      0  C.A. 24579  10.5000   NaN        S      adult  
54   male  65.0      0      1      113509  61.9792   B30        C      adult  
280  male  65.0      0      0      336439   7.7500   NaN        Q      adult  
   PassengerId  Survived  Pclass                                  Name   Sex  \
0          631         1       1  Barkworth, Mr. Algernon Henry Wilson  male   
1          852         0       3                   Svensson, Mr. Johan  male   
2          494         0       1               Artagaveytia, Mr. Ramon  male   
3           97         0       1             Goldschmidt, Mr. George B  male   
4          117         0       3                  Connors, Mr. Patrick  male   
5          673         0       2           Mitchell, Mr. Henry Michael  male   
6          746         0       1          Crosby, Capt. Edward Gifford  male   
7           34         0       2                 Wheadon, Mr. Edward H  male   
8           55         0       1        Ostby, Mr. Engelhart Cornelius  male   
9          281         0       3                      Duane, Mr. Frank  male   
    Age  SibSp  Parch      Ticket     Fare Cabin Embarked  
0  80.0      0      0       27042  30.0000   A23        S  
1  74.0      0      0      347060   7.7750   NaN        S  
2  71.0      0      0    PC 17609  49.5042   NaN        C  
3  71.0      0      0    PC 17754  34.6542    A5        C  
4  70.5      0      0      370369   7.7500   NaN        Q  
5  70.0      0      0  C.A. 24580  10.5000   NaN        S  
6  70.0      1      1   WE/P 5735  71.0000   B22        S  
7  66.0      0      0  C.A. 24579  10.5000   NaN        S  
8  65.0      0      1      113509  61.9792   B30        C  
9  65.0      0      0      336439   7.7500   NaN        Q  
自定义函数的使用

df.apply(定义的函数名)

def hundredth_row(column):
    """Return the hundredth item of a Series.

    Intended for use with ``DataFrame.apply``, which calls it once per column.

    Args:
        column: a pandas Series (or any object supporting ``.iloc``).

    Returns:
        The element at position 99, i.e. the hundredth item.

    Raises:
        IndexError: if the Series holds fewer than 100 items.
    """
    # Position 99 is the hundredth item because positions start at 0.
    return column.iloc[99]
# Return the hundredth item from each column.
# NOTE: this assignment rebinds the name `hundredth_row` from the function to
# the resulting Series, so the function cannot be called again afterwards.
hundredth_row = titanic_survival.apply(hundredth_row)
print(hundredth_row)
PassengerId                  100
Survived                       0
Pclass                         2
Name           Kantor, Mr. Sinai
Sex                         male
Age                           34
SibSp                          1
Parch                          0
Ticket                    244367
Fare                          26
Cabin                        NaN
Embarked                       S
age_labels                 adult
dtype: object
def not_null_count(column):
    """Count the missing (null) values in a column.

    NOTE: despite its name, this function counts the null entries,
    not the non-null ones.

    Args:
        column: a pandas Series; ``DataFrame.apply`` passes each column in turn.

    Returns:
        int: the number of null entries in ``column``.
    """
    # Summing a boolean mask counts its True entries.
    missing_mask = pd.isnull(column)
    return int(missing_mask.sum())
# apply() calls the function once per column, giving the null count of each column.
column_null_count = titanic_survival.apply(not_null_count)
print(column_null_count)
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
age_labels       0
dtype: int64
def which_class(row):
    """Translate a row's numeric ``Pclass`` value into a readable label.

    Meant to be used with ``DataFrame.apply(..., axis=1)``, which iterates
    over rows instead of columns. It plays the role of a SQL
    ``CASE WHEN ... THEN ... END`` expression.

    Args:
        row: a mapping-like row (e.g. a pandas Series) with a ``'Pclass'`` key.

    Returns:
        ``"Unknown"`` when the class is null, ``"First Class"``,
        ``"Second Class"`` or ``"Third Class"`` for 1, 2 or 3, and ``None``
        for any other value.
    """
    pclass = row['Pclass']
    if pd.isnull(pclass):
        return "Unknown"
    class_names = {
        1: "First Class",
        2: "Second Class",
        3: "Third Class",
    }
    # .get() returns None for values outside 1-3.
    return class_names.get(pclass)
# axis=1 makes apply() iterate over rows rather than columns (the default).
classes = titanic_survival.apply(which_class, axis=1)
print(classes)
0       Third Class
1       First Class
2       Third Class
3       First Class
4       Third Class
5       Third Class
6       First Class
7       Third Class
8       Third Class
9      Second Class
10      Third Class
11      First Class
12      Third Class
13      Third Class
14      Third Class
15     Second Class
16      Third Class
17     Second Class
18      Third Class
19      Third Class
20     Second Class
21     Second Class
22      Third Class
23      First Class
24      Third Class
25      Third Class
26      Third Class
27      First Class
28      Third Class
29      Third Class
           ...     
861    Second Class
862     First Class
863     Third Class
864    Second Class
865    Second Class
866    Second Class
867     First Class
868     Third Class
869     Third Class
870     Third Class
871     First Class
872     First Class
873     Third Class
874    Second Class
875     Third Class
876     Third Class
877     Third Class
878     Third Class
879     First Class
880    Second Class
881     Third Class
882     Third Class
883    Second Class
884     Third Class
885     Third Class
886    Second Class
887     First Class
888     Third Class
889     First Class
890     Third Class
dtype: object
def is_minor(row):
    """Tell whether the passenger in ``row`` is under 18.

    Args:
        row: a mapping-like row (e.g. a pandas Series) with an ``"Age"`` key.

    Returns:
        bool: True when the age is below 18, otherwise False. A NaN age
        compares as not-less-than and therefore yields False.
    """
    return bool(row["Age"] < 18)
# Row-wise apply: one True/False flag per passenger.
minors = titanic_survival.apply(func=is_minor, axis="columns")
def generate_age_label(row):
    """Classify the passenger in ``row`` by age.

    Args:
        row: a mapping-like row (e.g. a pandas Series) with an ``"Age"`` key.

    Returns:
        str: ``"unknown"`` when the age is null, ``"minor"`` when it is
        below 18, and ``"adult"`` otherwise.
    """
    age = row["Age"]
    # Handle the missing case first so the comparison below never sees NaN.
    if pd.isnull(age):
        return "unknown"
    return "minor" if age < 18 else "adult"
# Row-wise apply: one age label ("unknown" / "minor" / "adult") per passenger.
age_labels = titanic_survival.apply(generate_age_label, axis=1)
print(age_labels)
0        adult
1        adult
2        adult
3        adult
4        adult
5      unknown
6        adult
7        minor
8        adult
9        minor
10       minor
11       adult
12       adult
13       adult
14       minor
15       adult
16       minor
17     unknown
18       adult
19     unknown
20       adult
21       adult
22       minor
23       adult
24       minor
25       adult
26     unknown
27       adult
28     unknown
29     unknown
        ...   
861      adult
862      adult
863    unknown
864      adult
865      adult
866      adult
867      adult
868    unknown
869      minor
870      adult
871      adult
872      adult
873      adult
874      adult
875      minor
876      adult
877      adult
878    unknown
879      adult
880      adult
881      adult
882      adult
883      adult
884      adult
885      adult
886      adult
887      adult
888    unknown
889      adult
890      adult
dtype: object
# Store the labels as a new column, then compare survival across the groups.
titanic_survival['age_labels'] = age_labels
# With no aggfunc given, pivot_table averages "Survived" within each label.
age_group_survival = titanic_survival.pivot_table(index="age_labels", values="Survived")
print(age_group_survival)
age_labels
adult      0.381032
minor      0.539823
unknown    0.293785
Name: Survived, dtype: float64

https://www.jianshu.com/p/2e3656430b47

Python量化投资网携手4326手游为资深游戏玩家推荐:《梦幻西游下载

「点点赞赏,手留余香」

    还没有人赞赏,快来当第一个赞赏的人吧!
Keras
0 条回复 A 作者 M 管理员
    所有的伟大,都源于一个勇敢的开始!
欢迎您,新朋友,感谢参与互动!欢迎您 {{author}},您在本站有{{commentsCount}}条评论