|
REDD 数据预处理 Python 程序
# -*- coding: utf-8 -*-
import argparse
import os
import time

import matplotlib.pyplot as plt
import pandas as pd
# Per-appliance settings, keyed by appliance name.
#
# Keys read by the preprocessing loop in this script:
#   'houses'     - REDD house numbers to process for the appliance
#   'channels'   - channel number of the appliance in each house, aligned
#                  position-by-position with 'houses'
#   'test_build' - the house whose data is written to the test CSV; every
#                  other listed house goes to training/validation
#   'mean'/'std' - constants used to normalise the appliance readings
#
# The remaining keys ('windowlength', 'on_power_threshold', 'max_on_power',
# 's2s_length', 'train_build') are not read in this script.
# NOTE(review): presumably consumed by the training code - confirm.
params_appliance = {
    'microwave': {
        'windowlength': 599,
        'on_power_threshold': 200,
        'max_on_power': 3969,
        'mean': 500,
        'std': 800,
        's2s_length': 128,
        'houses': [1, 2, 3],
        'channels': [11, 6, 16],
        'train_build': [2, 3],
        'test_build': 1,
    },
    'fridge': {
        'windowlength': 599,
        'on_power_threshold': 50,
        'max_on_power': 3323,
        'mean': 200,
        'std': 400,
        's2s_length': 512,
        'houses': [1, 2, 3],
        'channels': [5, 9, 7],
        'train_build': [2, 3],
        'test_build': 1,
    },
    'dishwasher': {
        'windowlength': 599,
        'on_power_threshold': 10,
        'max_on_power': 3964,
        'mean': 700,
        'std': 1000,
        's2s_length': 1536,
        'houses': [1, 2, 3],
        'channels': [6, 10, 9],
        'train_build': [2, 3],
        'test_build': 1,
    },
    'washingmachine': {
        'windowlength': 599,
        'on_power_threshold': 20,
        'max_on_power': 3999,
        'mean': 400,
        'std': 700,
        's2s_length': 2000,
        'houses': [1, 2, 3],
        # The original carried a commented-out alternative here:
        # 'channels': [19, 7, 13]
        'channels': [20, 7, 13],
        'train_build': [2, 3],
        'test_build': 1,
    },
}

# Single-appliance selector; other values listed in the original comments
# were 'washingmachine', 'fridge' and 'dishwasher'. The preprocessing loop in
# this script iterates over all appliances and does not read this name.
APPLIANCE_NAME = 'microwave'

# Constants used to normalise the summed mains ("aggregate") signal.
aggregate_mean = 522
aggregate_std = 814

start_time = time.time()

data_dir = 'redd/low_freq'   # root folder holding house_<n>/channel_<m>.dat
sample_seconds = 8           # width of each resampling bin, in seconds
validation_percent = 15      # share of the training rows held out for validation
nrows = None                 # row limit passed to pandas when reading; None = no limit
def _read_channel(path, value_column):
    """Load one ``channel_<n>.dat`` file as a time-indexed DataFrame.

    The file is read as whitespace-separated text; only the first two
    columns are used (timestamp, reading). The module-level ``nrows`` limits
    how many rows are read.

    Args:
        path: Path of the ``.dat`` file.
        value_column: Name to give the reading column.

    Returns:
        A DataFrame indexed by ``time`` (datetime) with the single column
        ``value_column``.
    """
    frame = pd.read_table(
        path,
        sep=r"\s+",  # raw string: "\s" is an invalid escape in a normal literal
        nrows=nrows,
        usecols=[0, 1],
        names=['time', value_column],
        dtype={'time': str},
    )
    # Convert the text timestamps to numbers first: passing strings together
    # with ``unit=`` to ``to_datetime`` is deprecated in recent pandas.
    frame['time'] = pd.to_datetime(pd.to_numeric(frame['time']), unit='s')
    return frame.set_index('time')


appliance_names = ["microwave", "fridge", "dishwasher", "washingmachine"]

# For every appliance: align mains and appliance readings house by house,
# normalise them, then write test / validation / training CSVs.
for appliance_name in appliance_names:
    print('--' * 20)
    print('\n' + appliance_name)

    appliance_cfg = params_appliance[appliance_name]
    save_path = 'created_data/REDD/{}/'.format(appliance_name)
    os.makedirs(save_path, exist_ok=True)

    train_parts = []  # normalised frames of all non-test houses
    # 'houses' and 'channels' are parallel lists: channel i belongs to house i.
    for h, channel in zip(appliance_cfg['houses'], appliance_cfg['channels']):
        house_dir = data_dir + '/house_' + str(h) + '/'
        print(' ' + house_dir + 'channel_' + str(channel) + '.dat')

        # Channels 1 and 2 are the two mains readings; their row-wise sum is
        # the aggregate signal (a reading missing on one side counts as 0).
        mains_df = _read_channel(house_dir + 'channel_1.dat', 'mains1').join(
            _read_channel(house_dir + 'channel_2.dat', 'mains2'), how='outer')
        aggregate_df = mains_df.sum(axis=1).to_frame('aggregate')
        app_df = _read_channel(
            house_dir + 'channel_' + str(channel) + '.dat', appliance_name)

        # Put both signals on a common grid of `sample_seconds`-wide bins,
        # fill a single missing bin from the next one, and drop the bins that
        # are still incomplete. The timestamp itself is not kept.
        df_align = (
            aggregate_df.join(app_df, how='outer')
            .resample(pd.Timedelta(seconds=sample_seconds))
            .mean()
            .bfill(limit=1)
            .dropna()
            .reset_index(drop=True)
        )
        del mains_df, aggregate_df, app_df  # free the raw frames early

        # Normalisation: fixed constants for the aggregate, per-appliance
        # constants for the appliance column.
        df_align['aggregate'] = (
            (df_align['aggregate'] - aggregate_mean) / aggregate_std)
        df_align[appliance_name] = (
            (df_align[appliance_name] - appliance_cfg['mean'])
            / appliance_cfg['std'])

        if h == appliance_cfg['test_build']:
            # Test CSV.
            # NOTE(review): mode='a' appends, so re-running the script adds
            # duplicate rows to an existing file - confirm this is intended.
            df_align.to_csv(save_path + appliance_name + '_test_.csv',
                            mode='a', index=False, header=False)
            print(" Size of test set is {:.4f} M rows.".format(
                len(df_align) / 10 ** 6))
        else:
            train_parts.append(df_align)
        del df_align

    # DataFrame.append was removed in pandas 2.0; concatenate once instead.
    if train_parts:
        train = pd.concat(train_parts, ignore_index=True)
    else:
        train = pd.DataFrame(columns=['aggregate', appliance_name])
    del train_parts

    # The last `validation_percent` percent of the training rows become the
    # validation set. A positional split is used so that val_len == 0 keeps
    # every training row (index[-0:] would have selected all of them).
    val_len = int((len(train) / 100) * validation_percent)
    split_at = len(train) - val_len
    val = train.iloc[split_at:].reset_index(drop=True)
    train = train.iloc[:split_at]

    # Validation CSV
    val.to_csv(save_path + appliance_name + '_validation_' + '.csv',
               mode='a', index=False, header=False)
    # Training CSV
    train.to_csv(save_path + appliance_name + '_training_.csv',
                 mode='a', index=False, header=False)
数据下载链接:非入侵负荷分解数据集下载
【1】本程序仅针对 REDD 的低频(low_freq)数据集。 |
|