1.导入依赖包
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
from sklearn.tree import DecisionTreeRegressor
2.导入数据集
train = pd.read_csv('./data/bike_train.csv')
test = pd.read_csv('./data/bike_test.csv')
3.数据预处理
3.1查看数据的缺失值数目和数据类型
print(train.isnull().sum().sort_values(ascending=False))
print(test.isnull().sum().sort_values(ascending=False))
print(train.info())
print(test.info())
3.2将数据转化为DataFrame类型
train.datetime = pd.to_datetime(train.datetime)
test.datetime = pd.to_datetime(test.datetime)
print(train.info())
print(test.info())
3.3重新定义数据集
train['year'] = train['datetime'].dt.year
train['month'] = train['datetime'].dt.month
train['day'] = train['datetime'].dt.day
train['hour'] = train['datetime'].dt.hour
train['week'] = train['datetime'].dt.week
test['year'] = test['datetime'].dt.year
test['month'] = test['datetime'].dt.month
test['day'] = test['datetime'].dt.day
test['hour'] = test['datetime'].dt.hour
test['week'] = test['datetime'].dt.week
print(train.tail(3))
print(test.tail(3))
3.4查看各字段数据的相关性
plt.figure(figsize=(16, 8))
sns.heatmap(train.corr(), annot=True)
plt.show()
3.5查看各数目出现的次数
plt.figure(figsize=(16, 8))
sns.distplot(train['count'])
plt.show()
3.6查看数目的变化情况
plt.figure(figsize=(16, 8))
plt.plot(train['datetime'][0:500], train['count'][0:500])
plt.show()
3.7查看数据的统计性描述
plt.hist(x='workingday', data=train)
plt.figure(figsize=(16, 8))
sns.boxplot(x='season', y='count', data=train)
plt.figure(figsize=(16, 8))
sns.boxplot(x='week', y='count', data=train)
plt.figure(figsize=(16, 8))
sns.boxplot(x='hour', y='count', data=train)
plt.figure(figsize=(16, 8))
sns.boxplot(x='year', y='count', data=train)
plt.figure(figsize=(16, 8))
plt.hist(train['count'][train['year'] == 2011], alpha=0.5, label='2011')
plt.hist(train['count'][train['year'] == 2012],alpha=0.5,label='2012',color='red')
plt.scatter(train['hour'], train['count'])
print(train.head(3))
3.8删除datetime这一列
del train['datetime']
Q1 = train.quantile(0.25)
Q3 = train.quantile(0.75)
IQR = Q3 - Q1
print(IQR)
train_wind = train[~((train < (Q1 - 1.5 * IQR)) |
(train > (Q3 + 1.5 * IQR))).any(axis=1)]
train_wind.dropna(inplace=True)
print(train.info())
print(train_wind.info())
print(train_wind.head(3))
plt.figure(figsize=(12, 7))
sns.boxplot(x='season', y='windspeed', data=train_wind, palette='winter')
def wind(cols):
windspeed = cols[0]
season = cols[1]
if windspeed == 0:
if season == 1:
return 14
elif season == 2:
return 14
else:
return 13
else:
return windspeed
train_wind['wind'] = train_wind[['windspeed', 'season']].apply(wind, axis=1)
test['wind'] = test[['windspeed', 'season']].apply(wind, axis=1)
print(test.head(3))
print(train_wind.head(3))
train_wind[['season', 'holiday', 'workingday', 'weather', 'year', 'month',
'day', 'hour', 'week']] = train_wind[['season', 'holiday',
'workingday', 'weather',
'year', 'month', 'day', 'hour',
'week']].astype('category')
test[['season', 'holiday', 'workingday', 'weather', 'year', 'month', 'day', 'hour',
'week']] = test[['season', 'holiday', 'workingday', 'weather', 'year',
'month', 'day', 'hour','week']].astype('category')
print(train_wind.info())
X = train_wind[['season', 'holiday', 'workingday', 'weather', 'temp', 'atemp',
'humidity', 'year', 'month', 'day', 'hour', 'week', 'wind']]
y = train_wind['count']
3.9切分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
3.10数据归一化
y_train = y_train.values.reshape(-1, 1)
y_test = y_test.values.reshape(-1, 1)
sc_X = MinMaxScaler()
sc_y = MinMaxScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.fit_transform(X_test)
y_train = sc_X.fit_transform(y_train)
y_test = sc_y.fit_transform(y_test)
sc_X = MinMaxScaler()
sc_X.fit(X_train)
X_train = sc_X.transform(X_train)
X_test = sc_X.transform(X_test)
sc_y = MinMaxScaler()
sc_y.fit(y_train)
y_train = sc_y.transform(y_train)
y_test = sc_y.transform(y_test)
4.定义模型
rf = RandomForestRegressor(n_estimators=100)
rf.fit(X_train, y_train)
rf_prediction = rf.predict(X_test)
print('MSE:', metrics.mean_squared_error(y_test, rf_prediction))
5.数据可视化和训练模型
plt.scatter(y_test, rf_prediction)
plt.figure(figsize=(16, 8))
plt.plot(rf_prediction[0:200], 'r')
plt.plot(y_test[0:200])
dt_reg = DecisionTreeRegressor()
dt_reg.fit(X_train, y_train)
dt_prediction = dt_reg.predict(X_test)
print('MSE:', metrics.mean_squared_error(y_test, dt_prediction))
plt.scatter(y_test, dt_prediction)
print(test.head(3))
test[['season', 'holiday', 'workingday', 'weather', 'temp', 'atemp',
'humidity', 'year', 'month', 'day', 'hour', 'week',
'wind']] =sc_X.fit_transform(test[['season', 'holiday',
'workingday', 'weather', 'temp', 'atemp',
'humidity', 'year', 'month', 'day',
'hour', 'week', 'wind']])
test_pred = rf.predict(test[['season', 'holiday', 'workingday', 'weather', 'temp',
'atemp', 'humidity', 'year', 'month', 'day',
'hour', 'week', 'wind']])
print(test_pred)
test_pred = test_pred.reshape(-1, 1)
test_pred = sc_y.inverse_transform(test_pred)
test_pred = pd.DataFrame(test_pred, columns=['count'])
df = pd.concat([test['datetime'], test_pred], axis=1)
print(df.head(3))
6.将结果写入预测文件中
df['count'] = df['count'].astype('int')
df.to_csv('submission.csv', index=False)
转载请注明来源,欢迎对文章中的引用来源进行考证,欢迎指出任何有错误或不够清晰的表达。可以在下面评论区评论,也可以邮件至 2621041184@qq.com