Python pandas

Rhythm

pandas辅助包: pandasgui(可以详细查看pandas对象的数值,并且可以进行一些简单的可视化操作:画图,导出等)

import pandas as pd
from pandasgui import show
# Load the CSV at `in_file` (expected to be defined by the surrounding script).
df = pd.read_csv(in_file)
# Open the DataFrame in the pandasgui viewer.
show(df)

数据读取:

read_csv(读取csv比读取excel快)

# Column names to assign to the whitespace-separated input file.
tec_out_djm_label = ['year', 'doy', 'value']
# sep=r'\s+' splits on any run of whitespace; it replaces the deprecated
# delim_whitespace=True flag.
df = pd.read_csv(in_file, sep=r'\s+', names=tec_out_djm_label)
# If the first column is useless it can be dropped (a CSV written by pandas
# usually carries the index as an unnamed first column).
if 'Unnamed: 0' in df.columns:
    print('Unnamed: 0 in df.columns \n')
    df = df.iloc[:, 1:]
# Reading a compressed file.
# compression accepts: {'infer', 'gzip', 'bz2', 'zip', 'xz', None}
df = pd.read_csv(in_file, compression='gzip', index_col=0, sep=r'\s+',
                 names=tec_out_djm_label)
# Without pandas, the gzip module can read the file as text, line by line.
import gzip
with gzip.open(plot_value_from_txt_file, 'rt') as in_f:
    for line in in_f:
        print(line.rstrip('\n'))

list转dataframe:

# Each inner list is one record: first name, last name, age.
People_List = [
    ['Jon', 'Smith', 21],
    ['Mark', 'Brown', 38],
    ['Maria', 'Lee', 42],
    ['Jill', 'Jones', 28],
    ['Jack', 'Ford', 55],
]
# Build the frame from the records, naming the three columns explicitly.
df = pd.DataFrame(data=People_List, columns=['First_Name', 'Last_Name', 'Age'])
print(df)

数据输出

import os
# Build the output path from the output directory and the base file name.
out_file_name = file_name + '.txt'
out_file = os.path.join(out_dir, out_file_name)
# index must be the boolean False: the string 'False' is truthy, so the index
# column would still be written.
df.to_csv(out_file, index=False)
# To save space the frame can also be written to a compressed file.
# compression accepts: {'infer', 'gzip', 'bz2', 'zip', 'xz', None}
out_file = file_name + '.gz'
df.to_csv(out_file, index=False, compression='gzip')
# Without pandas, the gzip module can write text directly.
import gzip
# NOTE(review): plot_value_from_txt_file is the name the original notes use
# here — confirm it is the intended output path.
with gzip.open(plot_value_from_txt_file, 'wt') as out_f:
    out_f.write(data[i] + '\n')

数据操作:

loc

[x1,x2]第一个是行,第二个是列,可以使用切片或者索引 (:代表取所有行)

# Select every row (:) and only the three listed columns, by label.
time_collect = df.loc[:,['yy','doy','thours']]

取数据长度

函数	描述
df.info() 	显示行数、列数等（查看详情：pandas info()函数）
len(df)	获取行数
len(df.columns)	获取列数
df.shape	获取行数和列数
df.size	得到元素个数

更改值

# Sample frame: four people with name, gender and age columns.
df1 = pd.DataFrame({
    'name': ['Snow', 'Tyrion', 'Sansa', 'Arya'],
    'gender': ['M', 'M', 'F', 'F'],
    'age': [22, 32, 18, 14],
})
print("--------更换单个值----------")
# loc and iloc can assign to a single cell as well as to whole rows/columns:
# locate the target first, then assign with =.
df1.loc[0, 'age'] = 25   # loc: row label + column name
df1.iloc[0, 2] = 25      # iloc: integer positions
# at and iat address exactly one cell.
df1.at[0, 'age'] = 25    # at: row label + column name only
df1.iat[0, 2] = 25       # iat: integer positions only

新插入行或列

df1 = pd.DataFrame(
    [['Snow', 'M', 22], ['Tyrion', 'M', 32], ['Sansa', 'F', 18], ['Arya', 'F', 14]],
    columns=['name', 'gender', 'age'],
)
print("----------在最后新增一列---------------")
print("-------案例1----------")
# Append a 'score' column at the end with the values 80, 98, 67, 90.
# The list length must equal the number of rows in the frame.
df1['score'] = [80, 98, 67, 90]
print(df1)
print("-------案例2----------")
print("---------在指定位置新增列:用insert（）--------")
# Add a 'city' column right after 'gender' by editing the list of column
# names with list.insert(index, obj) and reindexing the frame to that list.
col_name = df1.columns.tolist()          # current column names as a list
print(col_name)
col_name.insert(2, 'city')               # position 2 = directly after 'gender'
df1 = df1.reindex(columns=col_name)      # 'city' now exists but is all NaN
df1['city'] = ['北京', '山西', '湖北', '澳门']   # fill the new column
print(df1)
print("----------新增行---------------")
# The row to add is first built as its own one-row DataFrame.
new = pd.DataFrame({'name': 'lisa',
                    'gender': 'F',
                    'city': '北京',
                    'age': 19,
                    'score': 100},
                   index=[1])   # custom index 1; it is discarded by ignore_index below
print(new)
print("-------在原数据框df1最后一行新增一行，用pd.concat方法------------")
# DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
# ignore_index=True renumbers the result from 0 instead of keeping old labels.
df1 = pd.concat([df1, new], ignore_index=True)
print(df1)

提取/去掉包含某类值的行:

# Drop the rows whose 'thours' value is 24.0.
thours_is_24 = df['thours'].isin([24.0])
df = df.loc[~thours_is_24]
# Keep only the rows whose 'local_hour' value is 0.0.
local_hour_is_zero = df['local_hour'].isin([0.0])
df_for_cal_time = df.loc[local_hour_is_zero]
Note:
df_for_cal_time = df_for_cal_time.reset_index(drop=True)
筛选之后,index仍然保留原df中的标签,因此会变得不连续;如果需要从0开始的连续index,需要像上面那样调用reset_index(drop=True)

筛选数据

按照列的条件筛选
# Example frame with three numeric columns: a, b and c.
df = pd.DataFrame({
    'a': [1, -1, 1, -1],
    'b': [-1, 1, 1, 1],
    'c': [-1, -1, 1, -1],
})
# Every comparison needs its own parentheses: & and | bind more tightly than
# > and <, so a bare `... | df['c'] > 0` is parsed as `(... | df['c']) > 0`.
selected = df[((df['a'] > 0) & (df['b'] < 0)) | (df['c'] > 0)]

四则运算

# Percentage deviation of 'tecint' from 'median_27_days', computed element-wise
# over the two columns of B and collected into a list.
A = list(100*(B['tecint']-B['median_27_days'])/B['median_27_days'])
对每行的特定列进行加减乘除

滑动窗口

# Window length, in rows, for the sliding median.
len_sliding_window = 27
# Single-column frame holding only 'tecint'.
df_tec_int = df[["tecint"]]
# Median over each window; the window ends at (and includes) the current row.
windowed_median = df_tec_int.rolling(window=len_sliding_window).median()
# shift(1) moves every result down one row, so each row receives the median
# of the rows before it and its own value is excluded.
median_value_lst = list(windowed_median.shift(1)['tecint'])
滑动求27天的中位数,shift(1)代表求中位数时排除当前值,如果不加shift, 该滑动27天的窗口是包含了当前天的

drop

pandas.DataFrame删除/选取含有特定数值的行或列
1.删除/选取某列含有特殊数值的行
import pandas as pd
import numpy as np

# 3x3 grid holding 1..9, filled row by row.
a = np.arange(1, 10).reshape(3, 3)
df1 = pd.DataFrame(a, index=[f'row{i}' for i in range(3)], columns=['A', 'B', 'C'])
print(df1)
# Keep an untouched copy under the name df2.
df2 = df1.copy()
# isin([1]) flags the rows whose column A equals 1.
# df1[flag] would select those rows; negating with ~ keeps the others instead.
a_is_one = df1['A'].isin([1])
df1 = df1[~a_is_one]
print(df1)
--------------------- 
2.删除/选取某行含有特殊数值的列
# Remove (or select) the columns that hold a given value in a given row.
# Walk the first row and collect the labels of the columns whose value is 3.
first_row = df2.iloc[0]
cols = [label for label, value in zip(df2.columns, first_row) if value == 3]
print(cols)
# df2[cols] would select those columns; drop() removes them instead.
df2 = df2.drop(columns=cols)
print(df2)
3.删除含有空值的行或列
实现思路：利用pandas.DataFrame.fillna对空值赋予特定值，再利用上文介绍的方法找到这些含有特定值的行或列去除即可。(更直接的做法是使用DataFrame.dropna，或者用isna/notna生成布尔掩码。)
import pandas as pd
import numpy as np

# Four rows of sample data; NaN marks the missing cells.
df1 = pd.DataFrame(
    [
        [np.nan, 2, np.nan, 0],
        [3, 4, np.nan, 1],
        [np.nan, np.nan, np.nan, 5],
        [np.nan, 3, np.nan, 4],
    ],
    columns=list('ABCD'),
)
print(df1)
df2 = df1.copy()

# Remove the rows whose column A is missing. dropna tests for NaN directly,
# so no placeholder string has to be written into the numeric column first
# (the fill-then-match trick would turn the column into mixed object dtype).
df1 = df1.dropna(subset=['A'])
print(df1)

# Remove the columns that are missing in the first row: isna() on that row
# gives a boolean mask over the column labels.
cols = df2.columns[df2.iloc[0].isna()].tolist()
print(cols)
df2 = df2.drop(cols, axis=1)
print(df2)

重置索引

1. df2.index = range(len(df2))
2. df2 = df2.reset_index(drop=True)
3. df2 = df2.reindex(labels=range(len(df2)))  (注意: reindex是按标签取行,并不会重新编号;只有原index本来就是0..n-1时才等价,index不连续时缺失的标签会变成全NaN的行,这种情况请用方法1或2)

ref: pandas中DataFrame重置设置索引 - Marketing123 - 博客园

多个dataframe操作

# Stack the frames row-wise with pd.concat; ignore_index=True renumbers the
# rows of the result from 0 instead of keeping each frame's own labels.
df_lst = [df1, df2]
df = pd.concat(df_lst, ignore_index=True)
#ref: https://www.datacamp.com/community/tutorials/joining-dataframes-pandas

匹配时间

# Datetimes at a fixed step ('15min' or '1hour') between two bounds.
def time_lst_with_fixed_interval(start_date_time, end_date_time, resolution=None):
    """Return the datetimes from start to end (inclusive) at a fixed step.

    Args:
        start_date_time: first datetime of the list.
        end_date_time: upper bound; included when the stepping lands on it.
        resolution: '15min' or '1hour'. When omitted, the module-level
            ``time_resolution`` setting is used, which is what this function
            read before the parameter existed.

    Returns:
        A list of datetimes; empty when start_date_time is after
        end_date_time.

    Raises:
        ValueError: if the resolution is not one of the supported names.
            Previously this case printed a traceback and returned None.
    """
    if resolution is None:
        # Backward-compatible default: fall back to the module-level setting.
        resolution = time_resolution
    supported_steps = {
        '15min': datetime.timedelta(minutes=15),
        '1hour': datetime.timedelta(hours=1),
    }
    if resolution not in supported_steps:
        raise ValueError(
            "unsupported time resolution %r; expected one of %s"
            % (resolution, sorted(supported_steps))
        )
    # The step is loop-invariant, so it is looked up once.
    time_interval = supported_steps[resolution]
    datetime_lst = []
    current_datetime = start_date_time
    while current_datetime <= end_date_time:
        datetime_lst.append(current_datetime)
        current_datetime = current_datetime + time_interval
    return datetime_lst
# Load the CSV at `in_file`, then drop the first column when it is the
# 'Unnamed: 0' column (the index column of a CSV previously written by pandas).
df = pd.read_csv(in_file)
if 'Unnamed: 0' in df.columns:
     df = df.iloc[:, 1:]
# loop and match by time
        for time in datetime_lst:
            #print('gathering data of ')
            print(time.strftime('%Y_%m_%d_%j_%H_%M')+'\n')
            P_nor_epoch_lst = []
            lon_epoch_lst = []
            lat_epoch_lst = []
            for i in range(len(df_lst)):
                current_df = df_lst[i]
                #show(current_df)
                # time_tmp = time.strftime('%Y-%m-%d %H:%M:%S\n')
                # print((time_tmp))
                # print((current_df.loc[0,'date_time']))
                info = current_df[current_df['date_time'].isin([time])]