北京房价再预测

基于神经网络

Posted by Gavin on June 14, 2019

玉楼金阙慵归去

且插梅花醉洛阳

前言

之前为了练手Scikit-Learn,使用过一套北京房价数据。最近初学了Tensorflow,跃跃欲试,于是在之前的基础上,改用神经网络对同一套数据再练一次手。


步骤

准备数据

沿用之前的数据清洗以及准备代码,得到待使用的数据集:

from datetime import datetime

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.model_selection import StratifiedShuffleSplit, cross_val_score, StratifiedKFold, train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import Imputer, OneHotEncoder, StandardScaler
from tensorflow.contrib.layers import fully_connected


class DataNumCleaner(BaseEstimator, TransformerMixin):
    """Drop rows with unusable values and convert room/age columns to numbers.

    When ``clean`` is true, ``transform`` keeps only rows where:

    * ``constructionTime`` is not ``'0'``, ``'1'`` or ``'未知'``;
    * ``buildingType`` is one of 1, 2, 3, 4;
    * ``livingRoom`` is not the spreadsheet error marker ``'#NAME?'``;
    * ``drawingRoom`` is one of the strings ``'0'``..``'5'``;
    * ``bathRoom`` is one of the strings ``'0'``..``'7'``.

    It then replaces ``constructionTime`` with ``2018 - constructionTime``
    and casts ``bathRoom``, ``drawingRoom`` and ``livingRoom`` to float64.

    Parameters
    ----------
    clean : bool, default True
        If false, ``transform`` returns its input untouched.
    """

    _BAD_CONSTRUCTION_TIMES = ['0', '1', '未知']
    _BUILDING_TYPES = [1, 2, 3, 4]
    _DRAWING_ROOMS = ['0', '1', '2', '3', '4', '5']
    _BATH_ROOMS = ['0', '1', '2', '3', '4', '5', '6', '7']
    _FLOAT_COLUMNS = ('bathRoom', 'drawingRoom', 'livingRoom')

    def __init__(self, clean=True):
        self.clean = clean

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the cleaner itself."""
        return self

    def transform(self, X, y=None):
        """Return a cleaned copy of ``X`` (or ``X`` itself when ``clean`` is false)."""
        if not self.clean:
            return X

        keep = (
            ~X['constructionTime'].isin(self._BAD_CONSTRUCTION_TIMES)
            & X['buildingType'].isin(self._BUILDING_TYPES)
            & (X['livingRoom'] != '#NAME?')
            & X['drawingRoom'].isin(self._DRAWING_ROOMS)
            & X['bathRoom'].isin(self._BATH_ROOMS)
        )
        # Take an explicit copy before assigning columns: writing into a
        # boolean-filtered frame is pandas "chained assignment" and triggers
        # SettingWithCopyWarning. The caller's frame is never modified.
        cleaned = X.loc[keep].copy()

        cleaned['constructionTime'] = 2018 - cleaned['constructionTime'].astype('int64')
        for column in self._FLOAT_COLUMNS:
            cleaned[column] = cleaned[column].astype('float64')
        return cleaned

class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that picks columns out of a DataFrame.

    ``attribute_name`` is used to index the incoming frame, and the
    ``.values`` of that selection is what ``transform`` hands on.
    """

    def __init__(self, attribute_name):
        self.attribute_name = attribute_name

    def fit(self, X, y=None):
        """Nothing to learn; return the selector itself."""
        return self

    def transform(self, X):
        """Return ``X[self.attribute_name].values``."""
        wanted = self.attribute_name
        selection = X[wanted]
        return selection.values

def load_housing_data(file_path):
    """Read the comma-separated file at ``file_path`` into a DataFrame.

    The file is read with ``low_memory=False``.
    """
    read_options = {'sep': ',', 'low_memory': False}
    return pd.read_csv(file_path, **read_options)


# Load the listings and drop the columns that are not used further on.
housing = load_housing_data('new.csv')
housing = housing.drop(
    ['url', 'id', 'price', 'Cid', 'DOM', 'tradeTime', 'floor', 'followers', 'communityAverage'],
    axis=1,
)

# One 80/20 split, stratified on the district column.
spliter = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# split() yields positional row indices, so they must be applied with .iloc;
# label-based .loc only happens to work while the index is a default RangeIndex.
train_index, test_index = next(spliter.split(housing, housing['district']))
train_set = housing.iloc[train_index]
test_set = housing.iloc[test_index]

# Work on a copy of the training part from here on.
housing = train_set.copy()

# Columns that go through imputation + scaling, and columns that are one-hot encoded.
num_attributes = [
    'Lng', 'Lat', 'square', 'livingRoom', 'drawingRoom', 'kitchen',
    'bathRoom', 'constructionTime', 'ladderRatio', 'elevator',
    'fiveYearsProperty', 'subway',
]
cat_attributes = ['buildingType', 'renovationCondition', 'buildingStructure', 'district']


def _cleaned_pipeline(columns, extra_steps=()):
    """Build a Pipeline: row cleaner -> column selector -> ``extra_steps``."""
    steps = [
        ('cleaner', DataNumCleaner()),
        ('selector', DataFrameSelector(columns)),
    ]
    steps.extend(extra_steps)
    return Pipeline(steps)


# NOTE(review): Imputer was deprecated in scikit-learn 0.20 and later removed;
# sklearn.impute.SimpleImputer is its replacement on newer versions.
num_pipeline = _cleaned_pipeline(num_attributes, [
    ('imputer', Imputer(strategy='most_frequent')),
    ('std_scaler', StandardScaler()),
])
cat_pipeline = _cleaned_pipeline(cat_attributes, [
    ('encoder', OneHotEncoder()),
])
# The label goes through the same cleaner so its rows line up with the features.
label_pipeline = _cleaned_pipeline(['totalPrice'])

# Numeric and categorical outputs are concatenated into one feature matrix.
full_pipeline = FeatureUnion([
    ('num_pipeline', num_pipeline),
    ('cat_pipeline', cat_pipeline),
])

housing_prepared = full_pipeline.fit_transform(housing)
housing_label = label_pipeline.fit_transform(housing)

构建神经网络

  1. 准备TensorBoard日志目录,每次运行以当前UTC时间命名

     # Each run logs into its own directory, named after the UTC time it started.
     root_logdir = 'tf_logs'
     now = '{:%Y%m%d%H%M%S}'.format(datetime.utcnow())
     logdir = root_logdir + '/run-' + now + '/'
    
  2. 准备输入参数

     # Take the input width from the prepared feature matrix instead of
     # hard-coding it: the number of one-hot columns depends on the data.
     n_inputs = housing_prepared.shape[1]
     n_hidden1 = 300
     n_hidden2 = 100
     n_outputs = 1

     # Batch size is left open (None); y has one column, the price label.
     X = tf.placeholder(tf.float32, shape=(None, n_inputs), name='X')
     y = tf.placeholder(tf.float32, shape=(None, n_outputs), name='y')
    
  3. 构建运算图

     # Two hidden layers followed by a linear output (activation_fn=None),
     # since this is a regression on the price.
     with tf.name_scope('dnn'):
         hidden1 = fully_connected(X, n_hidden1, scope='hidden1')
         hidden2 = fully_connected(hidden1, n_hidden2, scope='hidden2')
         logits = fully_connected(hidden2, n_outputs, scope='outputs', activation_fn=None)

     with tf.name_scope('loss'):
         # RMSE = sqrt(mean(squared error)). The earlier form applied
         # reduce_sum first, which collapses everything to a scalar, so the
         # following reduce_mean did nothing and the value was sqrt(SSE).
         # NOTE(review): this scales the gradient down by sqrt(batch size);
         # learning_rate below may need retuning.
         loss = tf.sqrt(tf.reduce_mean(tf.square(logits - y)), name='loss')

     learning_rate = 0.01
     with tf.name_scope('train'):
         optimizer = tf.train.GradientDescentOptimizer(learning_rate)
         training_op = optimizer.minimize(loss)

     # Scalar summary of the loss plus a writer that also stores the graph.
     with tf.name_scope('mark'):
         loss_change = tf.summary.scalar('loss', loss)
         file_writer = tf.summary.FileWriter(logdir, tf.get_default_graph())
    

    鉴于硬件配置,我使用了包含2个隐藏层(300、100节点)的神经网络,使用RMSE作为损失函数,共训练200个epoch,每10个epoch向TensorBoard写入一次loss并保存一次模型。

训练

init = tf.global_variables_initializer()
saver = tf.train.Saver()

# Split once, before the loop. Re-splitting at random every epoch lets rows
# that were "test" in one epoch be trained on in the next, so the reported
# test error would not measure generalisation.
X_train, X_test, y_train, y_test = train_test_split(
    housing_prepared, housing_label, test_size=0.2, random_state=42)
# The prepared features are a scipy sparse matrix; feed_dict needs dense arrays.
X_train = X_train.toarray()
X_test = X_test.toarray()
train_feed = {X: X_train, y: y_train}

with tf.Session() as sess:
    # To resume from a checkpoint, call saver.restore(sess, 'housing.ckpt')
    # here instead of init.run().
    init.run()
    for epoch in range(200):
        # One full-batch gradient step per epoch.
        sess.run(training_op, feed_dict=train_feed)
        epoch_loss = loss.eval(feed_dict=train_feed)
        # Evaluate on the held-out part, so 'Test RMSE' is really a test score.
        y_pred = logits.eval(feed_dict={X: X_test})
        mse = mean_squared_error(y_test, y_pred)
        rmse = np.sqrt(mse)
        print('Epoch: ', epoch, 'RMSE: ', epoch_loss, 'Test RMSE: ', rmse)
        if epoch % 10 == 0:
            # Every 10th epoch: log the loss for TensorBoard and checkpoint.
            loss_mark = loss_change.eval(feed_dict=train_feed)
            file_writer.add_summary(loss_mark, epoch)
            save_path = saver.save(sess, 'housing.ckpt')
    # Checkpoint the final weights too; the periodic save stops at epoch 190.
    save_path = saver.save(sess, 'housing.ckpt')

# Flush pending summaries to disk and release the event file.
file_writer.close()

查看训练情况

# Start TensorBoard on the tf_logs/ directory that the training run writes its summaries to.
tensorboard --logdir tf_logs/

可见loss逐渐收敛,但数值依然很高。要进一步降低loss,还有赖于之后的调参、运算图的重新构建以及训练细节的改进。

使用模型

# Build the evaluation data from the held-out test_set, using the pipelines
# already fitted on the training data (transform only, no refit). Without
# this, X_test_housing / y_test_housing are undefined names.
X_test_housing = full_pipeline.transform(test_set)
y_test_housing = label_pipeline.transform(test_set)

with tf.Session() as sess:
    # Load the trained weights saved during training.
    saver.restore(sess, 'housing.ckpt')
    test_pred = logits.eval(feed_dict={X: X_test_housing.toarray()})
# RMSE of the restored model on the held-out test set.
res_rmse = np.sqrt(mean_squared_error(y_test_housing, test_pred))