pandas学习1

阿里云双11来了!从本博客参与阿里云,服务器最低只要86元/年!

该库基于numpy,更加方便啦~

数据的读取
import pandas
food_info = pandas.read_csv("food_info.csv")
#print(type(food_info))
print (food_info.dtypes) #数据类型
#print (help(pandas.read_csv)) #查read_csv的帮助文档
NDB_No               int64 #整型
Shrt_Desc           object  #字符 string
Water_(g)          float64  #浮点型
Energ_Kcal           int64
Protein_(g)        float64
Lipid_Tot_(g)      float64
Ash_(g)            float64
Carbohydrt_(g)     float64
Fiber_TD_(g)       float64
Sugar_Tot_(g)      float64
Calcium_(mg)       float64
Iron_(mg)          float64
Magnesium_(mg)     float64
Phosphorus_(mg)    float64
Potassium_(mg)     float64
Sodium_(mg)        float64
Zinc_(mg)          float64
Copper_(mg)        float64
Manganese_(mg)     float64
Selenium_(mcg)     float64
Vit_C_(mg)         float64
Thiamin_(mg)       float64
Riboflavin_(mg)    float64
Niacin_(mg)        float64
Vit_B6_(mg)        float64
Vit_B12_(mcg)      float64
Vit_A_IU           float64
Vit_A_RAE          float64
Vit_E_(mg)         float64
Vit_D_mcg          float64
Vit_D_IU           float64
Vit_K_(mcg)        float64
FA_Sat_(g)         float64
FA_Mono_(g)        float64
FA_Poly_(g)        float64
Cholestrl_(mg)     float64
dtype: object
#first_rows = food_info.head() # 前多少条数据
#print first_rows
#print(food_info.head(3))
#print(food_info.tail(3))  # 后3条数据
#print food_info.columns  # 列名
#print food_info.shape # 查看dataframe的维度
(8618, 36)
#pandas uses zero-indexing
#Series object representing the row at index 0.
#print food_info.loc[0]  #location  取第一条数据
# Series object representing the seventh row.
#food_info.loc[6]
# Will throw an error: "KeyError: 'the label [8620] is not in the [index]'"
#food_info.loc[8620]
#The object dtype is equivalent to a string in Python
#数据类型解释
#object - For string values
#int - For integer values
#float - For float values
#datetime - For time values
#bool - For Boolean values
#print(food_info.dtypes)
# Returns a DataFrame containing the rows at indexes 3, 4, 5, and 6.
#food_info.loc[3:6]  # 取对应区间的值
# Returns a DataFrame containing the rows at indexes 2, 5, and 10. Either of the following approaches will work.
# Method 1
#two_five_ten = [2,5,10]   # 取对应行位置的值
#food_info.loc[two_five_ten]
# Method 2
#food_info.loc[[2,5,10]]


image.png

# Series object representing the "NDB_No" column.
#ndb_col = food_info["NDB_No"]  #取对应的列
#print ndb_col
# Alternatively, you can access a column by passing in a string variable.
#col_name = "NDB_No"
#ndb_col = food_info[col_name]
#columns = ["Zinc_(mg)", "Copper_(mg)"]   #查多列 组成 list结构
#zinc_copper = food_info[columns]
#print zinc_copper
#print zinc_copper
# Skipping the assignment.
#zinc_copper = food_info[["Zinc_(mg)", "Copper_(mg)"]]
#print(food_info.columns)
#print(food_info.head(2))
col_names = food_info.columns.tolist()  #把列名转为list
#print col_names
gram_columns = []
for c in col_names:
    if c.endswith("(g)"):  判断是否每一项以(g)结尾
        gram_columns.append(c)
gram_df = food_info[gram_columns]
print(gram_df.head(3))
   Water_(g)  Protein_(g)  Lipid_Tot_(g)  Ash_(g)  Carbohydrt_(g)  \
0      15.87         0.85          81.11     2.11            0.06   
1      15.87         0.85          81.11     2.11            0.06   
2       0.24         0.28          99.48     0.00            0.00   
   Fiber_TD_(g)  Sugar_Tot_(g)  FA_Sat_(g)  FA_Mono_(g)  FA_Poly_(g)  
0           0.0           0.06      51.368       21.021        3.043  
1           0.0           0.06      50.489       23.426        3.012  
2           0.0           0.00      61.924       28.732        3.694  
数据的运算
import pandas
food_info = pandas.read_csv("food_info.csv")
col_names = food_info.columns.tolist()
print(col_names)
print(food_info.head(3))
['NDB_No', 'Shrt_Desc', 'Water_(g)', 'Energ_Kcal', 'Protein_(g)', 'Lipid_Tot_(g)', 'Ash_(g)', 'Carbohydrt_(g)', 'Fiber_TD_(g)', 'Sugar_Tot_(g)', 'Calcium_(mg)', 'Iron_(mg)', 'Magnesium_(mg)', 'Phosphorus_(mg)', 'Potassium_(mg)', 'Sodium_(mg)', 'Zinc_(mg)', 'Copper_(mg)', 'Manganese_(mg)', 'Selenium_(mcg)', 'Vit_C_(mg)', 'Thiamin_(mg)', 'Riboflavin_(mg)', 'Niacin_(mg)', 'Vit_B6_(mg)', 'Vit_B12_(mcg)', 'Vit_A_IU', 'Vit_A_RAE', 'Vit_E_(mg)', 'Vit_D_mcg', 'Vit_D_IU', 'Vit_K_(mcg)', 'FA_Sat_(g)', 'FA_Mono_(g)', 'FA_Poly_(g)', 'Cholestrl_(mg)']
   NDB_No                 Shrt_Desc  Water_(g)  Energ_Kcal  Protein_(g)  \
0    1001          BUTTER WITH SALT      15.87         717         0.85   
1    1002  BUTTER WHIPPED WITH SALT      15.87         717         0.85   
2    1003      BUTTER OIL ANHYDROUS       0.24         876         0.28   
   Lipid_Tot_(g)  Ash_(g)  Carbohydrt_(g)  Fiber_TD_(g)  Sugar_Tot_(g)  \
0          81.11     2.11            0.06           0.0           0.06   
1          81.11     2.11            0.06           0.0           0.06   
2          99.48     0.00            0.00           0.0           0.00   
        ...        Vit_A_IU  Vit_A_RAE  Vit_E_(mg)  Vit_D_mcg  Vit_D_IU  \
0       ...          2499.0      684.0        2.32        1.5      60.0   
1       ...          2499.0      684.0        2.32        1.5      60.0   
2       ...          3069.0      840.0        2.80        1.8      73.0   
   Vit_K_(mcg)  FA_Sat_(g)  FA_Mono_(g)  FA_Poly_(g)  Cholestrl_(mg)  
0          7.0      51.368       21.021        3.043           215.0  
1          7.0      50.489       23.426        3.012           219.0  
2          8.6      61.924       28.732        3.694           256.0  
[3 rows x 36 columns]
#print food_info["Iron_(mg)"]
#div_1000 = food_info["Iron_(mg)"] / 1000   # 对每个值除以1000
#print div_1000
# Adds 100 to each value in the column and returns a Series object.
#add_100 = food_info["Iron_(mg)"] + 100
# Subtracts 100 from each value in the column and returns a Series object.
#sub_100 = food_info["Iron_(mg)"] - 100
# Multiplies each value in the column by 2 and returns a Series object.
#mult_2 = food_info["Iron_(mg)"]*2
#It applies the arithmetic operator to the first value in both columns, the second value in both columns, and so on
water_energy = food_info["Water_(g)"] * food_info["Energ_Kcal"]   # 对应位置进行运算操作
water_energy = food_info["Water_(g)"] * food_info["Energ_Kcal"]
iron_grams = food_info["Iron_(mg)"] / 1000  
food_info["Iron_(g)"] = iron_grams  #添加一个列名字段,增加维度
#Score=2×(Protein_(g))−0.75×(Lipid_Tot_(g))
weighted_protein = food_info["Protein_(g)"] * 2
weighted_fat = -0.75 * food_info["Lipid_Tot_(g)"]
initial_rating = weighted_protein + weighted_fat
# the "Vit_A_IU" column ranges from 0 to 100000, while the "Fiber_TD_(g)" column ranges from 0 to 79
#For certain calculations, columns like "Vit_A_IU" can have a greater effect on the result, 
#due to the scale of the values
# The largest value in the "Energ_Kcal" column.
max_calories = food_info["Energ_Kcal"].max()   # min()最小值   mean()均值
# 归一化处理,该列每一项除以最大值
# Divide the values in "Energ_Kcal" by the largest value.
normalized_calories = food_info["Energ_Kcal"] / max_calories
normalized_protein = food_info["Protein_(g)"] / food_info["Protein_(g)"].max()
normalized_fat = food_info["Lipid_Tot_(g)"] / food_info["Lipid_Tot_(g)"].max()
food_info["Normalized_Protein"] = normalized_protein
food_info["Normalized_Fat"] = normalized_fat
#By default, pandas will sort the data by the column we specify in ascending order and return a new DataFrame
# Sorts the DataFrame in-place, rather than returning a new DataFrame.
#print food_info["Sodium_(mg)"]
food_info.sort_values("Sodium_(mg)", inplace=True)
print food_info["Sodium_(mg)"]
#Sorts by descending order, rather than ascending.
food_info.sort_values("Sodium_(mg)", inplace=True, ascending=False)
print food_info["Sodium_(mg)"]
760     0.0
610     0.0
611     0.0
8387    0.0
8607    0.0
629     0.0
630     0.0
631     0.0
6470    0.0
654     0.0
8599    0.0
633     0.0
634     0.0
635     0.0
637     0.0
638     0.0
639     0.0
646     0.0
         ...   
8153        NaN
8155        NaN
8156        NaN
8157        NaN
8158        NaN
8159        NaN
8160        NaN
8161        NaN
8163        NaN
8164        NaN
8165        NaN
8167        NaN
8169        NaN
8170        NaN
8172        NaN
8173        NaN
8174        NaN
8175        NaN
8176        NaN
8177        NaN
8178        NaN
8179        NaN
8180        NaN
8181        NaN
8183        NaN
8184        NaN
8185        NaN
8195        NaN
8251        NaN
8267        NaN
Name: Sodium_(mg), dtype: float64

https://www.jianshu.com/p/c417b22c62bc

Python量化投资网携手4326手游为资深游戏玩家推荐:《《非人学园》:更新公告丨你老婆的时装限时6折!偶像祭开幕啦

「点点赞赏,手留余香」

    还没有人赞赏,快来当第一个赞赏的人吧!
0 条回复 A 作者 M 管理员
    所有的伟大,都源于一个勇敢的开始!
欢迎您,新朋友,感谢参与互动!欢迎您 {{author}},您在本站有{{commentsCount}}条评论