python_data_analysis10
#!/usr/bin/env python3 # -*- coding: utf-8 -*- """ Created on Fri Nov 3 13:09:18 2021 @author: lee """ import numpy as np import pandas as pd from keras.layers.core import Activation, Dense from keras.models import Sequential """ programmer_1-->简单的数据筛选,划分数据 programmer_2-->阈值寻优???不懂。。 programmer_3-->建立训练神经网络,并进行模型的检验 programmer_4-->根据几个特征推算出是否满足某项条件 event_num-->相邻时间的差分,比较是否大于阈值 """ def programmer_1(): # 阈值 threshold = pd.Timedelta("4 min") inputfile = "data/water_heater.xls" outputfile = "tmp/dividsequence.xls" data = pd.read_excel(inputfile) # dataframe处理 data[u"发生时间"] = pd.to_datetime(data[u"发生时间"], format="%Y%m%d%H%M%S") data = data[data[u"水流量"] > 0] # 流量大于0 d = data[u"发生时间"].diff() > threshold # 相邻时间作差分,大于threshold data[u"事件编号"] = d.cumsum() + 1 # 通过累积求和的方式为事件编号 data.to_excel(outputfile) # 相邻时间作差分,比较是否大于阈值 def programmer_2(): inputfile = "data/water_heater.xls" # 使用之后四个点的平均斜率 n = 4 # 专家阈值 threshold = pd.Timedelta(minutes=5) data = pd.read_excel(inputfile) data[u"发生时间"] = pd.to_datetime(data[u"发生时间"], format="%Y%m%d%H%M%S") data = data[data[u"水流量"] > 0] # 定义阈值列 dt = [pd.Timedelta(minutes=i) for i in np.arange(1, 9, 0.25)] h = pd.DataFrame(dt, columns=[u"阈值"]) def event_num(ts): d = data[u"发生时间"].diff() > ts # 返回事件数 return d.sum() + 1 # 计算每个阈值对应的事件数 h[u"事件数"] = h[u"阈值"].apply(event_num) # 计算每两个相邻点对应的斜率 h[u"斜率"] = h[u"事件数"].diff() / 0.25 # 采用后n个的斜率绝对值平均作为斜率指标 h[u"斜率指标"] = pd.Series.rolling(h[u"斜率"].abs(), n).mean() ts = h[u"阈值"][h[u"斜率指标"].idxmin() - n] if ts > threshold: ts = pd.Timedelta(minutes=4) print(ts) def programmer_3(): inputfile1 = "data/train_neural_network_data.xls" inputfile2 = "data/test_neural_network_data.xls" testoutputfile = "tmp/test_output_data.xls" # 读取训练集和测试集,并且划分样本特征和标签 data_train = pd.read_excel(inputfile1) data_test = pd.read_excel(inputfile2) y_train = data_train.iloc[:, 4].as_matrix() x_train = data_train.iloc[:, 5:17].as_matrix() y_test = data_test.iloc[:, 4].as_matrix() x_test = data_test.iloc[:, 5:17].as_matrix() # 建立神经网络模型 model = Sequential() model.add(Dense(17, input_shape=(11, ))) model.add(Activation("relu")) model.add(Dense(10, input_shape=(17, ))) model.add(Activation("relu")) model.add(Dense(1, input_shape=(10, ))) model.add(Activation("sigmoid")) # 编译模型 model.compile( loss="binary_crossentropy", optimizer="adam", sample_weight_mode="binary") # 训练模型 model.fit(x_train, y_train, nb_epoch=100, batch_size=1) # 保存模型 model.save_weights("tmp/net.model") # 进行预测 r = pd.DataFrame(model.predict_classes(x_test), columns=[u"预测结果"]) pd.concat([data_test.iloc[:, :5], r], axis=1).to_excel(testoutputfile) model.predict(x_test) return y_test def programmer_4(): threshold = pd.Timedelta("4 min") inputfile = "data/water_heater.xls" outputfile = "tmp/attribute_extract.xls" data = pd.read_excel(inputfile) data[u"发生时间"] = pd.to_datetime(data[u"发生时间"], format="%Y%m%d%H%M%S") data = data[data[u"水流量"] > 0] d = data[u"发生时间"].diff() > threshold data[u"事件编号"] = d.cumsum() + 1 data_g = data.groupby(u"事件编号") result = pd.DataFrame() dt = pd.Timedelta(seconds=2) for _, g in data_g: temp = pd.DataFrame(index=[0]) # 根据用水时长、开关机切换次数、总用水量推出是否是洗澡 tstart = g[u"发生时间"].min() tend = g[u"发生时间"].max() temp[u"用水事件时长(M)"] = (dt + tend - tstart).total_seconds() / 60 temp[u"开关机切换次数"] = (pd.Series.rolling(g[u"开关机状态"] == u"关", 2).sum() == 1).sum() temp[u"总用水量(L)"] = g[u"水流量"].sum() tdiff = g[u"发生时间"].diff() if len(g[u"发生时间"]) == 1: temp[u"总用水时长(Min)"] = dt.total_seconds() / 60 else: temp[u"总用水时长(Min)"] = ( tdiff.sum() - tdiff.iloc[1] / 2 - tdiff.iloc[len(tdiff) - 1] / 2).total_seconds() / 60 temp[u"平均水流量(L/min)"] = temp[u"总用水量(L)"] / temp[u"总用水时长(Min)"] result = result.append(temp, ignore_index=True) result.to_excel(outputfile) if __name__ == "__main__": # programmer_1() # programmer_2() # programmer_3() # programmer_4() pass
#携程暑期实习#
Python 文章被收录于专栏
Python由荷兰数学和计算机科学研究学会的吉多·范罗苏姆于1990年代初设计,作为一门叫做ABC语言的替代品。Python提供了高效的高级数据结构,还能简单有效地面向对象编程。Python语法和动态类型,以及解释型语言的本质,使它成为多数平台上写脚本和快速开发应用的编程语言,随着版本的不断更新和语言新功能的添加,逐渐被用于独立的、大型项目的开发