## Load the libraries
import os
import numpy as np
import pandas as pd
# data viz
import matplotlib.pyplot as plt
import seaborn as sns
# machine learning
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import xgboost as xgb
import multiprocessing
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline
## Load the dataset into a pandas data frame
airbnb_train = pd.read_csv(os.path.abspath('train_users_2.csv'))
airbnb_test = pd.read_csv(os.path.abspath('test_users.csv'))
airbnb_sessions = pd.read_csv(os.path.abspath('sessions.csv'))
print("There are", airbnb_train.shape[0], "users and", airbnb_train.shape[1], "features in the training set and", airbnb_test.shape[0], "users and", airbnb_test.shape[1], "features in the test set.")
There are 213451 users and 16 features in the training set and 62096 users and 15 features in the test set.
print("Total users in training and test sets are:", airbnb_train.shape[0]+airbnb_test.shape[0])
Total users in training and test sets are: 275547
print("There are", airbnb_sessions.shape[0], "users and", airbnb_sessions.shape[1], "features in the session set")
There are 10567737 users and 6 features in the session set
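Note that sessions.csv holds one row per logged action rather than one row per user, so a quick check of the number of distinct users (a small sketch) keeps the two figures apart:
# Each row of sessions.csv is one logged action; count distinct users separately
print("Distinct users in the session set:", airbnb_sessions['user_id'].nunique())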
We group the session data by user and by `action`, `action_type`, and `action_detail` to get the number of times each user performed each action, turning the raw session log into features the models can train on.
session1 = airbnb_sessions.groupby(["user_id","action"]).agg({"action": "count"})
session2 = airbnb_sessions.groupby(["user_id","action_type"]).agg({"action_type": "count"})
session3 = airbnb_sessions.groupby(["user_id","action_detail"]).agg({"action_detail": "count"})
session1 = session1.rename(columns={'action':'action_count'}).reset_index()
session2 = session2.rename(columns={'action_type':'action_type_count'}).reset_index()
session3 = session3.rename(columns={'action_detail':'action_detail_count'}).reset_index()
Now we pivot each of the grouped tables to make the data wide, with one row per user and one count column per action value, a format the machine learning models can consume.
session1 = session1.pivot_table(index='user_id', columns='action', values='action_count').fillna(0)
session2 = session2.pivot_table(index='user_id', columns='action_type', values='action_type_count').fillna(0)
session3 = session3.pivot_table(index='user_id', columns='action_detail', values='action_detail_count').fillna(0)
session_pivot = session1.merge(session2, on='user_id', how='outer').merge(session3, on='user_id', how='outer')
session_pivot.head()
Output preview of `session_pivot.head()` (5 rows × 524 columns): one row per `user_id` and one zero-filled count column for every `action`, `action_type`, and `action_detail` value (for example `view_search_results`, `wishlist_content_update`, and `your_trips`).
airbnb_df = airbnb_train.merge(session_pivot, left_on='id', right_on='user_id', how='inner')
print("There are", airbnb_df.shape[0],"observations and", airbnb_df.shape[1], "features in the dataset")
There are 73815 observations and 540 features in the dataset
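Because this is an inner join, training users with no rows in sessions.csv are dropped. A minimal sketch (assuming `session_pivot` is still indexed by `user_id`, as above) of how many users the merge discards:
# Training users that never appear in the session log are lost in the inner join
users_with_sessions = airbnb_train['id'].isin(session_pivot.index)
print("Training users without session data:", (~users_with_sessions).sum())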
airbnb_df.head()
Output preview of `airbnb_df.head()` (5 rows × 540 columns): the original user-level features (`id`, `date_account_created`, `timestamp_first_active`, `date_first_booking`, `gender`, `age`, `signup_method`, `signup_flow`, `language`, `affiliate_channel`, ...) followed by the per-user session count columns. Several rows show the literal value `-unknown-` in `gender` and missing values in `age` and `date_first_booking`.
By looking at the data, we can see that the `gender` and `first_browser` columns contain `-unknown-` values. We replace `-unknown-` with `NaN`.
airbnb_df.replace('-unknown-', np.nan, inplace=True)
airbnb_df.head()
Output preview of `airbnb_df.head()` (5 rows × 540 columns): the same preview as above, with the former `-unknown-` entries now shown as `NaN`.
## Checking for missing values
airbnb_df.isnull().sum()
id                             0
date_account_created           0
timestamp_first_active         0
date_first_booking         45041
gender                     37788
                           ...
wishlist_content_update        3
wishlist_note                  3
your_listings                  3
your_reservations              3
your_trips                     3
Length: 540, dtype: int64
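With 540 columns the full listing is hard to scan; a short sketch to surface only the columns with the most missing values:
# Show the ten columns with the largest number of missing values
missing = airbnb_df.isnull().sum()
print(missing[missing > 0].sort_values(ascending=False).head(10))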
Next, let's look at the distribution of the target variable `country_destination`.
plt.figure(figsize=(10,5))
total = float(len(airbnb_df))
plot = sns.countplot(x='country_destination', data=airbnb_df)
plt.xlabel('Destination Country')
plt.ylabel('Number of users')
# Annotate each bar with its percentage of the total
for p in plot.patches:
    percentage = '{:.1f}%'.format(100 * p.get_height() / total)
    x = p.get_x() + p.get_width() / 2   # center the label over the bar
    y = p.get_height()
    plot.annotate(percentage, (x, y), ha='center')
The graph above shows that about 61% of users did not book any destination (NDF, i.e. no destination found), and the US (27%) is the most-booked country. Since all users in the dataset are from the US, this suggests that US travelers are most likely to travel within the US.
Here we only consider users who made at least one reservation, so we exclude NDF from the target column `country_destination`.
airbnb_df = airbnb_df[airbnb_df['country_destination']!='NDF']
airbnb_df.age.describe()
count    22700.000000
mean        41.917753
std        106.748982
min         15.000000
25%         27.000000
50%         32.000000
75%         41.000000
max       2014.000000
Name: age, dtype: float64
The maximum age is 2014, which is impossible; it looks like a birth year was entered instead of an age. We need to convert those year values into ages and also impose sensible minimum and maximum limits on the age column.
# Rows where a birth year (e.g. 2014) was entered instead of an age
airbnb_df_with_year = airbnb_df['age'] > 1000
airbnb_df.loc[airbnb_df_with_year, 'age'] = 2015 - airbnb_df.loc[airbnb_df_with_year, 'age']
# Treat implausible ages as missing, then flag missing ages with -1
airbnb_df.loc[airbnb_df.age > 95, 'age'] = np.nan
airbnb_df.loc[airbnb_df.age < 16, 'age'] = np.nan
airbnb_df['age'].fillna(-1, inplace=True)
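A quick sanity check (a sketch; the `-1` placeholder is excluded here) that the cleaned ages fall in the expected range:
# Real ages should now lie between 16 and 95; missing ages were coded as -1
print(airbnb_df.loc[airbnb_df['age'] > 0, 'age'].describe())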
airbnb_df.head()
Output preview of `airbnb_df.head()` (5 rows × 540 columns): only rows with a booking remain, and the cleaned `age` column now holds either a plausible age or the `-1` placeholder for missing values.
# Cast date_account_created to a proper datetime format and split it into weekday, month, and year
airbnb_df["date_account_created"] = pd.to_datetime(airbnb_df["date_account_created"])
airbnb_df["date_account_created_day"] = airbnb_df.date_account_created.dt.weekday
airbnb_df["date_account_created_month"] = airbnb_df.date_account_created.dt.month
airbnb_df["date_account_created_year"] = airbnb_df.date_account_created.dt.year
# Cast timestamp_first_active to a proper datetime format and split it into weekday, month, year, and hour
airbnb_df['timestamp_first_active'] = pd.to_datetime(airbnb_df['timestamp_first_active'], format='%Y%m%d%H%M%S')
airbnb_df["timestamp_first_active_day"] = airbnb_df.timestamp_first_active.dt.weekday
airbnb_df["timestamp_first_active_month"] = airbnb_df.timestamp_first_active.dt.month
airbnb_df["timestamp_first_active_year"] = airbnb_df.timestamp_first_active.dt.year
airbnb_df["timestamp_first_active_hour"] = airbnb_df.timestamp_first_active.dt.hour
# Convert the split datetime fields to object dtype so they are treated as categorical features
airbnb_df["date_account_created_day"] = airbnb_df["date_account_created_day"].astype("O")
airbnb_df["date_account_created_month"] = airbnb_df["date_account_created_month"].astype("O")
airbnb_df["date_account_created_year"] = airbnb_df["date_account_created_year"].astype("O")
airbnb_df["timestamp_first_active_day"] = airbnb_df["timestamp_first_active_day"].astype("O")
airbnb_df["timestamp_first_active_month"] = airbnb_df["timestamp_first_active_month"].astype("O")
airbnb_df["timestamp_first_active_year"] = airbnb_df["timestamp_first_active_year"].astype("O")
airbnb_df["timestamp_first_active_hour"] = airbnb_df["timestamp_first_active_hour"].astype("O")
# Change the signup_flow to an object data type
airbnb_df["signup_flow"] = airbnb_df["signup_flow"].astype("O")
# Drop the original date/time columns (now replaced by the split features) along with date_first_booking, age, and id, which are not used for modeling
drop_cols = ["date_account_created","timestamp_first_active","date_first_booking", "age", "id"]
airbnb_df.drop(columns=drop_cols, inplace=True)
plt.figure(figsize=(10,5))
total = float(len(airbnb_df))
plot = sns.countplot(x='timestamp_first_active_day', data=airbnb_df)
plt.xlabel('Day of Week')
plt.ylabel('Number of users')
# pandas dt.weekday encodes Monday as 0 and Sunday as 6
plt.xticks([0, 1, 2, 3, 4, 5, 6], ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'])
for p in plot.patches:
    percentage = '{:.1f}%'.format(100 * p.get_height() / total)
    x = p.get_x() + p.get_width() / 2   # center the label over the bar
    y = p.get_height()
    plot.annotate(percentage, (x, y), ha='center')
Sign-up activity fluctuates with the day of the week: the busiest day accounts for 16.1% of the dataset and the quietest for 11.5% (note that `dt.weekday` encodes Monday as 0, which is why the tick labels above start with Monday).
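The same percentages can be read off without the plot; a short sketch using `value_counts`:
# Share of users by weekday of first activity (0 = Monday ... 6 = Sunday)
day_share = airbnb_df['timestamp_first_active_day'].value_counts(normalize=True).sort_index()
print((day_share * 100).round(1))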
plt.figure(figsize=(10,5))
total = float(len(airbnb_df))
plot = sns.countplot(x='timestamp_first_active_month', data=airbnb_df)
plt.xlabel('Month of Year')
plt.ylabel('Number of users')
plt.xticks([0, 1, 2, 3, 4, 5], ['January', 'February', 'March', 'April', 'May', 'June'])
for p in plot.patches:
    percentage = '{:.1f}%'.format(100 * p.get_height() / total)
    x = p.get_x() + p.get_width() / 2   # center the label over the bar
    y = p.get_height()
    plot.annotate(percentage, (x, y), ha='center')
The plot above shows the relationship between the month of the year and the number of users who first became active on Airbnb. There is an upward trend from January to June in this dataset, which suggests a relationship between the month of the year and the number of new users.
# Collect the categorical columns for one-hot encoding
cat_cols = [col for col in airbnb_df.columns if airbnb_df[col].dtype =="O"]
cat_cols.remove("country_destination")
print(cat_cols)
# One-hot encode the categorical columns
train_df = pd.get_dummies(airbnb_df, columns=cat_cols, drop_first=True)
train_df.info()
['gender', 'signup_method', 'signup_flow', 'language', 'affiliate_channel', 'affiliate_provider', 'first_affiliate_tracked', 'signup_app', 'first_device_type', 'first_browser', 'date_account_created_day', 'date_account_created_month', 'date_account_created_year', 'timestamp_first_active_day', 'timestamp_first_active_month', 'timestamp_first_active_year', 'timestamp_first_active_hour']
<class 'pandas.core.frame.DataFrame'>
Int64Index: 28774 entries, 0 to 73809
Columns: 666 entries, country_destination to timestamp_first_active_hour_23
dtypes: float64(524), object(1), uint8(141)
memory usage: 119.3+ MB
train_df.reset_index(inplace=True)
train_df.fillna(-1, inplace=True)
from sklearn.preprocessing import LabelEncoder
y = train_df['country_destination']
X = train_df.drop(columns='country_destination')
# Get unique values of the column
unique_values = train_df['country_destination'].unique()
# Initialize and fit the LabelEncoder
le = LabelEncoder()
le.fit(unique_values)
# Transform the target variable using labels
y_xgb = le.transform(y)
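XGBoost will return integer predictions, so it is useful to keep the class-to-integer mapping handy; a small sketch:
# Mapping from destination country to the integer label produced by the encoder
label_map = dict(zip(le.classes_, le.transform(le.classes_)))
print(label_map)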
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size =0.25, random_state=12)
tree_model = DecisionTreeClassifier(criterion='gini', max_depth=5, min_samples_split= 50)
tree_model.fit(X_train, y_train)
y_pred = tree_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(classification_report(y_test, y_pred))
print('Accuracy of decision tree model is {:.5f}'.format(accuracy))
              precision    recall  f1-score   support

          AU       0.00      0.00      0.00        42
          CA       0.00      0.00      0.00       115
          DE       0.00      0.00      0.00        64
          ES       0.00      0.00      0.00       180
          FR       0.50      0.01      0.01       378
          GB       0.00      0.00      0.00       206
          IT       0.08      0.00      0.01       228
          NL       0.00      0.00      0.00        53
          PT       0.00      0.00      0.00        21
          US       0.70      1.00      0.82      5016
       other       0.00      0.00      0.00       891

    accuracy                           0.70      7194
   macro avg       0.12      0.09      0.08      7194
weighted avg       0.52      0.70      0.57      7194

Accuracy of decision tree model is 0.69739
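The report shows the decision tree predicting `US` for almost every user. A confusion matrix (using `confusion_matrix`, already imported above) makes that collapse explicit; a minimal sketch:
# Confusion matrix for the decision tree: rows are true classes, columns are predictions
cm = confusion_matrix(y_test, y_pred, labels=tree_model.classes_)
print(pd.DataFrame(cm, index=tree_model.classes_, columns=tree_model.classes_))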
randomforest_model = RandomForestClassifier(n_estimators = 500, criterion='gini', max_depth=6)
randomforest_model.fit(X_train, y_train)
y_pred = randomforest_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(classification_report(y_test, y_pred))
print('Accuracy of random forest model is {:.5f}'.format(accuracy))
              precision    recall  f1-score   support

          AU       0.00      0.00      0.00        42
          CA       0.00      0.00      0.00       115
          DE       0.00      0.00      0.00        64
          ES       0.00      0.00      0.00       180
          FR       0.00      0.00      0.00       378
          GB       0.00      0.00      0.00       206
          IT       0.00      0.00      0.00       228
          NL       0.00      0.00      0.00        53
          PT       0.00      0.00      0.00        21
          US       0.70      1.00      0.82      5016
       other       0.00      0.00      0.00       891

    accuracy                           0.70      7194
   macro avg       0.06      0.09      0.07      7194
weighted avg       0.49      0.70      0.57      7194

Accuracy of random forest model is 0.69725
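Since the random forest also predicts mostly `US`, it is worth checking which features it relies on. A sketch using the fitted model's impurity-based feature importances:
# Top 10 features by impurity-based importance in the fitted random forest
importances = pd.Series(randomforest_model.feature_importances_, index=X_train.columns)
print(importances.sort_values(ascending=False).head(10))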
X_train, X_test, y_train, y_test = train_test_split(X, y_xgb, test_size =0.25, random_state=12)
print("Parallel Parameter optimization")
xgb_model = xgb.XGBClassifier(n_jobs=multiprocessing.cpu_count() // 2)
clf = GridSearchCV(xgb_model, {'max_depth': [2, 4, 6],
'n_estimators': [50, 100, 200]}, verbose=1,
n_jobs=2)
clf.fit(X_train, y_train)
print(clf.best_score_)
print(clf.best_params_)
Parallel Parameter optimization
Fitting 5 folds for each of 9 candidates, totalling 45 fits
0.6987951807228916
{'max_depth': 2, 'n_estimators': 100}
After running the grid search, the best parameters are max_depth = 2 and n_estimators = 100, so we train the final XGBoost model with those values.
xgb_model = xgb.XGBClassifier(max_depth=2, n_estimators=100)
xgb_model.fit(X_train, y_train)
y_pred = xgb_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(classification_report(y_test, y_pred))
print('Accuracy of XGBoost Model is {:.5f}'.format(accuracy))
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        42
           1       0.00      0.00      0.00       115
           2       0.00      0.00      0.00        64
           3       0.00      0.00      0.00       180
           4       0.40      0.01      0.02       378
           5       0.00      0.00      0.00       206
           6       0.20      0.00      0.01       228
           7       0.00      0.00      0.00        53
           8       0.00      0.00      0.00        21
           9       0.70      1.00      0.82      5016
          10       0.44      0.01      0.02       891

    accuracy                           0.70      7194
   macro avg       0.16      0.09      0.08      7194
weighted avg       0.57      0.70      0.58      7194

Accuracy of XGBoost Model is 0.69794
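Since `y_xgb` was label-encoded, the classes in this report are integers rather than country codes. A small sketch (reusing the `LabelEncoder` fitted earlier) to reprint the report with readable class names:
# Reprint the XGBoost report with the original country codes as class names
print(classification_report(y_test, y_pred, target_names=le.classes_))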