玉楼金阙慵归去
且插梅花醉洛阳
前言
之前为了练手 Scikit-Learn,我使用过一套北京房价数据。最近初学 TensorFlow,跃跃欲试,于是在之前工作的基础上,改用神经网络在同一套数据上练手。
步骤
准备数据
沿用之前的数据清洗以及准备代码,得到待使用的数据集:
from datetime import datetime

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.model_selection import (
    StratifiedKFold,
    StratifiedShuffleSplit,
    cross_val_score,
    train_test_split,
)
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import Imputer, OneHotEncoder, StandardScaler
from tensorflow.contrib.layers import fully_connected
class DataNumCleaner(BaseEstimator, TransformerMixin):
    """Drop rows with unusable values and cast the room-count columns to floats.

    Parameters
    ----------
    clean : bool, default True
        When False, ``transform`` returns its input untouched.
    reference_year : int, default 2018
        Year from which ``constructionTime`` is subtracted, so the column
        ends up holding ``reference_year - constructionTime`` (presumably the
        building's age in years).
    """

    def __init__(self, clean=True, reference_year=2018):
        self.clean = clean
        self.reference_year = reference_year

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    def transform(self, X, y=None):
        """Return a filtered, type-converted copy of ``X``.

        Rows are kept only when
        * ``constructionTime`` is not '0', '1' or '未知' ("unknown"),
        * ``buildingType`` is 1, 2, 3 or 4,
        * ``livingRoom`` is not the spreadsheet error marker '#NAME?',
        * ``drawingRoom`` is one of the strings '0'..'5',
        * ``bathRoom`` is one of the strings '0'..'7'.

        The input frame itself is never modified.
        """
        if not self.clean:
            return X

        keep = (
            ~X['constructionTime'].isin(['0', '1', '未知'])
            & X['buildingType'].isin([1, 2, 3, 4])
            & (X['livingRoom'] != '#NAME?')
            & X['drawingRoom'].isin(['0', '1', '2', '3', '4', '5'])
            & X['bathRoom'].isin(['0', '1', '2', '3', '4', '5', '6', '7'])
        )
        # Explicit copy: assigning columns on a boolean-filtered slice is
        # chained assignment and triggers pandas' SettingWithCopyWarning.
        cleaned = X[keep].copy()

        cleaned['constructionTime'] = (
            self.reference_year - cleaned['constructionTime'].astype('int64')
        )
        for column in ('bathRoom', 'drawingRoom', 'livingRoom'):
            cleaned[column] = cleaned[column].astype('float64')
        return cleaned
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that extracts the configured columns from a DataFrame.

    ``attribute_name`` is whatever is used to index the frame in
    ``transform`` (the pipelines in this file pass a list of column names).
    """

    def __init__(self, attribute_name):
        self.attribute_name = attribute_name

    def fit(self, X, y=None):
        """Nothing to fit; return ``self`` so the step can be chained."""
        return self

    def transform(self, X):
        """Return the selected columns of ``X`` as a raw array (``.values``)."""
        selected = X[self.attribute_name]
        return selected.values
def load_housing_data(file_path):
    """Read the comma-separated file at ``file_path`` into a DataFrame.

    The file is read with ``low_memory=False``.
    """
    frame = pd.read_csv(file_path, low_memory=False, sep=',')
    return frame
# Load the raw listings and drop the columns that are not used as features.
housing = load_housing_data('new.csv')
housing = housing.drop(
    ['url', 'id', 'price', 'Cid', 'DOM', 'tradeTime', 'floor', 'followers', 'communityAverage'],
    axis=1,
)

# One stratified 80/20 split on 'district'.
spliter = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# split() yields positional row indices, so index with .iloc (position-based)
# rather than .loc (label-based); the two only coincide for a default index.
train_index, test_index = next(spliter.split(housing, housing['district']))
train_set = housing.iloc[train_index]
test_set = housing.iloc[test_index]
housing = train_set.copy()

num_attributes = [
    'Lng', 'Lat', 'square', 'livingRoom', 'drawingRoom', 'kitchen', 'bathRoom',
    'constructionTime', 'ladderRatio', 'elevator', 'fiveYearsProperty', 'subway',
]
cat_attributes = ['buildingType', 'renovationCondition', 'buildingStructure', 'district']

# Every pipeline starts with its own DataNumCleaner so that the numeric
# features, the categorical features and the labels all drop the same rows.
num_pipeline = Pipeline([
    ('cleaner', DataNumCleaner()),
    ('selector', DataFrameSelector(num_attributes)),
    ('imputer', Imputer(strategy='most_frequent')),
    ('std_scaler', StandardScaler()),
])
cat_pipeline = Pipeline([
    ('cleaner', DataNumCleaner()),
    ('selector', DataFrameSelector(cat_attributes)),
    ('encoder', OneHotEncoder()),
])
label_pipeline = Pipeline([
    ('cleaner', DataNumCleaner()),
    ('selector', DataFrameSelector(['totalPrice'])),
])

# Numeric and one-hot encoded categorical features, concatenated column-wise.
full_pipeline = FeatureUnion([
    ('num_pipeline', num_pipeline),
    ('cat_pipeline', cat_pipeline),
])

housing_prepared = full_pipeline.fit_transform(housing)
housing_label = label_pipeline.fit_transform(housing)
构建神经网络
- 准备 TensorBoard 日志目录,以当前时间命名每次运行
# Timestamped TensorBoard log directory, so each run writes to its own folder
# under tf_logs/.
now = datetime.utcnow().strftime('%Y%m%d%H%M%S')
root_logdir = 'tf_logs'
logdir = '{}/run-{}/'.format(root_logdir, now)
- 准备输入参数
# Layer sizes. The input width is taken from the prepared feature matrix
# (previously hard-coded as 39) so it cannot drift out of sync with the
# number of columns the preprocessing pipeline produces.
n_inputs = housing_prepared.shape[1]
n_hidden1 = 300
n_hidden2 = 100
n_outputs = 1

# Placeholders for a batch of feature rows and the matching target values;
# the leading None leaves the batch size open.
X = tf.placeholder(tf.float32, shape=(None, n_inputs), name='X')
y = tf.placeholder(tf.float32, shape=(None, n_outputs), name='y')
- 构建运算图
# Two hidden layers followed by a single linear output unit
# (activation_fn=None). The output tensor keeps the name `logits` because
# later code refers to it, although this is a regression output.
with tf.name_scope('dnn'):
    hidden1 = fully_connected(X, n_hidden1, scope='hidden1')
    hidden2 = fully_connected(hidden1, n_hidden2, scope='hidden2')
    logits = fully_connected(hidden2, n_outputs, scope='outputs', activation_fn=None)

# Root-mean-squared error. The squared errors must be averaged directly:
# wrapping them in reduce_sum first collapses them to a scalar, which turns
# the outer reduce_mean into a no-op and yields sqrt(sum of squares) instead.
with tf.name_scope('loss'):
    loss = tf.sqrt(tf.reduce_mean(tf.square(tf.subtract(logits, y))), name='loss')

learning_rate = 0.01
with tf.name_scope('train'):
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    training_op = optimizer.minimize(loss)

# Scalar summary of the loss plus a writer that also records the graph.
with tf.name_scope('mark'):
    loss_change = tf.summary.scalar('loss', loss)

file_writer = tf.summary.FileWriter(logdir, tf.get_default_graph())
鉴于硬件配置有限,我使用了包含 2 个隐藏层(分别为 300、100 个节点)的神经网络,以 RMSE 作为损失函数,训练 200 个 epoch,每 10 个 epoch 向 TensorBoard 记录一次 loss。
训练
init = tf.global_variables_initializer()
saver = tf.train.Saver()

# Split once, before training. Re-splitting inside the epoch loop would draw
# a different random 20% every epoch, so every row would eventually be
# trained on and the "test" score would not measure held-out performance.
X_train, X_test, y_train, y_test = train_test_split(
    housing_prepared, housing_label, test_size=0.2, random_state=42
)
# The prepared features are a sparse matrix; convert to dense arrays once
# instead of on every epoch.
X_train = X_train.toarray()
X_test = X_test.toarray()
train_feed = {X: X_train, y: y_train}

with tf.Session() as sess:
    # Uncomment to resume from a previous checkpoint instead of a fresh start.
    #saver.restore(sess,'housing.ckpt')
    init.run()
    for epoch in range(200):
        # Full-batch gradient step on the training portion.
        sess.run(training_op, feed_dict=train_feed)
        epoch_loss = loss.eval(feed_dict=train_feed)

        # Evaluate on the held-out portion (not on the training rows).
        y_pred = logits.eval(feed_dict={X: X_test})
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        print('Epoch: ', epoch, 'RMSE: ', epoch_loss, 'Test RMSE: ', rmse)

        # Record the training loss for TensorBoard every 10th epoch.
        if epoch % 10 == 0:
            loss_mark = loss_change.eval(feed_dict=train_feed)
            file_writer.add_summary(loss_mark, epoch)

    # Make sure the buffered summaries reach disk, then checkpoint the model.
    file_writer.flush()
    save_path = saver.save(sess, 'housing.ckpt')
查看训练情况
# Start TensorBoard and point it at the tf_logs/ directory that holds the
# per-run log folders.
tensorboard --logdir tf_logs/
可见 loss 逐渐收敛,但数值依然很高,还有待后续通过调参、重新设计运算图以及改进训练细节来进一步降低。
使用模型
# Run the stratified hold-out split (test_set) through the already-fitted
# preprocessing pipelines; transform() rather than fit_transform() so the
# scaler and encoder keep the statistics learned from the training data.
# NOTE(review): assumes X_test_housing / y_test_housing were meant to be the
# prepared test_set — confirm.
X_test_housing = full_pipeline.transform(test_set)
y_test_housing = label_pipeline.transform(test_set)

with tf.Session() as sess:
    # Load the weights written by the training run.
    saver.restore(sess, 'housing.ckpt')
    test_pred = logits.eval(feed_dict={X: X_test_housing.toarray()})

# Final RMSE on the hold-out split.
res_rmse = np.sqrt(mean_squared_error(y_test_housing, test_pred))