Notebook – Quick Visualization and EDA for Beginners

阿里云双11来了!从本博客参与阿里云,服务器最低只要86元/年!

https://www.kaggle.com/pmarcelino/comprehensive-data-exploration-with-python

1 Importing

import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import pandas_profiling as pp
import plotly.graph_objs as go
from plotly.offline import iplot
import plotly.express as px
# Load both competition splits, using the "Id" column as the row index.
read_opts = {"index_col": "Id"}
df_train = pd.read_csv("../input/learn-together/train.csv", **read_opts)
df_test = pd.read_csv("../input/learn-together/test.csv", **read_opts)

2 EDA

# Quick look at the training frame: first rows, dtypes / non-null counts,
# and transposed summary statistics. The bare expressions only display
# their value when run as notebook cells.
df_train.head()
df_train.info()
df_train.describe().T
# Show the labels of the columns from position 10 up to (but excluding) the
# last one -- the same slice that is converted below.
# Fixed typo: `.colums` -> `.columns` (the misspelling raises AttributeError).
df_train.iloc[:, 10:-1].columns
# change columns to categorical ones
# NOTE(review): assigning through `.iloc[...] = ...` may write the values in
# place and keep the original dtypes, depending on the pandas version --
# confirm with `df_train.dtypes` that these columns really become "category".
df_train.iloc[:,10:-1] = df_train.iloc[:,10:-1].astype("category")
df_test.iloc[:,10:] = df_test.iloc[:,10:].astype("category")

热力图

# Correlation heatmap of the training frame, annotated to one decimal place.
heat_fig, heat_ax = plt.subplots(figsize=(8, 6))
corr_matrix = df_train.corr()
sns.heatmap(
    corr_matrix,
    ax=heat_ax,
    annot=True,
    fmt='.1f',
    linewidths=.5,
)
plt.show()

3 Data visualization

scatter 图

# Scatter plot of the two hydrology-distance columns:
# vertical distance on the x axis, horizontal distance on the y axis.
scatter_ax = df_train.plot(
    kind='scatter',
    x='Vertical_Distance_To_Hydrology',
    y='Horizontal_Distance_To_Hydrology',
    figsize=(12, 9),
    color='darkblue',
    alpha=0.5,
)
# The axes returned by `plot` is the current axes, so labelling it directly
# is the same as going through the pyplot state machine.
scatter_ax.set_title('Vertical And Horizontal Distance To Hydrology')
scatter_ax.set_xlabel("Vertical Distance")
scatter_ax.set_ylabel("Horizontal Distance")
plt.show()

box plot

# One box trace per hydrology-distance column:
# (source column, legend name, marker colour).
box_specs = [
    ("Vertical_Distance_To_Hydrology", 'Vertical Distance', 'rgb(0,145,119)'),
    ("Horizontal_Distance_To_Hydrology", 'Horizontal Distance', 'rgb(5, 79, 174)'),
]
data = [
    go.Box(y=df_train[column], name=label, marker={'color': colour})
    for column, label, colour in box_specs
]
# Fixed-size figure with a light grey background for both paper and plot area.
layout = {
    'autosize': False,
    'width': 700,
    'height': 500,
    'title': 'Distance To Hydrology',
    'paper_bgcolor': 'rgb(243, 243, 243)',
    'plot_bgcolor': 'rgb(243, 243, 243)',
    'margin': {'l': 40, 'r': 30, 'b': 80, 't': 100},
}
fig = {'data': data, 'layout': layout}
iplot(fig)

histogram

# Side-by-side 30-bin histograms of the two hydrology-distance columns.
hist_fig, hist_axes = plt.subplots(1, 2, figsize=(15, 7))
# (source column, panel title, bar colour, x tick positions)
hist_specs = [
    ('Vertical_Distance_To_Hydrology', 'Vertical Distance To Hydrology',
     'crimson', range(-150, 350, 50)),
    ('Horizontal_Distance_To_Hydrology', 'Horizontal Distance To Hydrology',
     'darkmagenta', range(0, 1000, 100)),
]
for panel, (column, panel_title, bar_colour, ticks) in zip(hist_axes, hist_specs):
    df_train[column].plot.hist(
        ax=panel, bins=30, edgecolor='black', color=bar_colour
    )
    panel.set_title(panel_title)
    panel.set_xticks(list(ticks))
plt.show()

bar plot

# Column-wise totals of the columns from position 14 up to (excluding) the
# last one -- presumably the soil-type indicator columns; verify against the
# dataset's column order.
soil_columns = df_train.iloc[:, 14:-1]
soil_types = soil_columns.sum(axis=0)

plt.figure(figsize=(18, 9))
sns.barplot(
    x=soil_types.index,
    y=soil_types.values,
    palette="rocket",
)
plt.title('Count of Soil Types With Value 1', color='darkred', fontsize=12)
plt.ylabel('Total')
# Tilt the tick labels so the column names do not overlap.
plt.xticks(rotation=75)
plt.show()

4 Pandas Profiling

# Build a pandas-profiling report object for the training frame.
report = pp.ProfileReport(df_train)
# Save the report as "report.html" (relative path, i.e. the working directory).
report.to_file("report.html")
# Bare expression: shown as the cell output when this runs in a notebook;
# it has no effect when the code runs as a plain script.
report

https://www.jianshu.com/p/77fc509cb8ea

Python量化投资网携手4326手游为资深游戏玩家推荐:《三途灵境 幽芷盛开《阴阳师》彼岸花全新皮肤上线!

「点点赞赏,手留余香」

    还没有人赞赏,快来当第一个赞赏的人吧!
0 条回复 A 作者 M 管理员
    所有的伟大,都源于一个勇敢的开始!
欢迎您,新朋友,感谢参与互动!欢迎您 {{author}},您在本站有{{commentsCount}}条评论