In [1]:
## Load the libraries
import os
import numpy as np 
import pandas as pd 

# data viz
import matplotlib.pyplot as plt 
import seaborn as sns

# machine learning
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import xgboost as xgb
import multiprocessing

import warnings
warnings.filterwarnings("ignore")

%matplotlib inline
In [2]:
## Load the dataset into a pandas data frame
airbnb_train = pd.read_csv(os.path.abspath('train_users_2.csv'))
airbnb_test = pd.read_csv(os.path.abspath('test_users.csv'))
airbnb_sessions = pd.read_csv(os.path.abspath('sessions.csv'))

Exploratory analysis¶

In [3]:
print("There are", airbnb_train.shape[0], "users and",  airbnb_train.shape[1], "features in the training set and", airbnb_test.shape[0], "users and", airbnb_test.shape[1], "features in the test set.")
There are 213451 users and 16 features in the training set and 62096 users and 15 features in the test set.
In [4]:
print("Total users in training and test sets are:", airbnb_train.shape[0]+airbnb_test.shape[0])
Total users in training and test sets are: 275547
In [5]:
print("There are", airbnb_sessions.shape[0], "users and", airbnb_sessions.shape[1], "features in the session set")
There are 10567737 users and 6 features in the session set

Grouping the session data based upon the action, action_type, and action_detail to get the number of actions per user and to group them according to ensure trainability

In [6]:
session1 = airbnb_sessions.groupby(["user_id","action"]).agg({"action": "count"})
session2 = airbnb_sessions.groupby(["user_id","action_type"]).agg({"action_type": "count"})
session3 = airbnb_sessions.groupby(["user_id","action_detail"]).agg({"action_detail": "count"})  

session1 = session1.rename(columns={'action':'action_count'}).reset_index()
session2 = session2.rename(columns={'action_type':'action_type_count'}).reset_index()
session3 = session3.rename(columns={'action_detail':'action_detail_count'}).reset_index()

Now we need to pivot the data in the broken down session data to make the data wider and more readable by the machine learning models

In [7]:
session1 = session1.pivot_table(index='user_id', columns='action', values='action_count').fillna(0)
session2 = session2.pivot_table(index='user_id', columns='action_type', values='action_type_count').fillna(0)
session3 = session3.pivot_table(index='user_id', columns='action_detail', values='action_detail_count').fillna(0)

session_pivot = session1.merge(session2, on='user_id', how='outer').merge(session3, on='user_id', how='outer')
In [8]:
session_pivot.head()
Out[8]:
10 11 12 15 about_us accept_decline account acculynk_bin_check_failed acculynk_bin_check_success acculynk_load_pin_pad ... view_resolutions view_search_results view_security_checks view_user_real_names wishlist wishlist_content_update wishlist_note your_listings your_reservations your_trips
user_id
00023iyk9l 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 5.0 0.0 0.0 0.0 4.0 0.0 0.0 0.0 2.0
0010k6l0om 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 10.0 0.0 0.0 0.0 8.0 0.0 0.0 0.0 0.0
001wyh0pz8 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 66.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
0028jgx1x1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 9.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
002qnbzfs5 9.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 125.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

5 rows × 524 columns

Now joining both train and session datasets together¶
In [9]:
airbnb_df = airbnb_train.merge(session_pivot, left_on='id', right_on='user_id', how='inner')
In [10]:
print("There are", airbnb_df.shape[0],"observations and", airbnb_df.shape[1], "features in the dataset")
There are 73815 observations and 540 features in the dataset
In [11]:
airbnb_df.head()
Out[11]:
id date_account_created timestamp_first_active date_first_booking gender age signup_method signup_flow language affiliate_channel ... view_resolutions view_search_results view_security_checks view_user_real_names wishlist wishlist_content_update wishlist_note your_listings your_reservations your_trips
0 d1mm9tcy42 2014-01-01 20140101000936 2014-01-04 MALE 62.0 basic 0 en sem-non-brand ... 0.0 23.0 0.0 0.0 0.0 25.0 0.0 0.0 0.0 0.0
1 yo8nz8bqcq 2014-01-01 20140101001558 NaN -unknown- NaN basic 0 en direct ... 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0
2 4grx6yxeby 2014-01-01 20140101001639 NaN -unknown- NaN basic 0 en sem-brand ... 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0
3 ncf87guaf0 2014-01-01 20140101002146 NaN -unknown- NaN basic 0 en direct ... 0.0 32.0 0.0 0.0 0.0 10.0 0.0 0.0 0.0 0.0
4 4rvqpxoh3h 2014-01-01 20140101002619 2014-01-02 -unknown- NaN basic 25 en direct ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

5 rows × 540 columns

By looking at the data, we can see that gender and first browser columns have unknown values. Replacing unknown with NaN.

In [12]:
airbnb_df.replace('-unknown-', np.nan, inplace=True)
In [13]:
airbnb_df.head()
Out[13]:
id date_account_created timestamp_first_active date_first_booking gender age signup_method signup_flow language affiliate_channel ... view_resolutions view_search_results view_security_checks view_user_real_names wishlist wishlist_content_update wishlist_note your_listings your_reservations your_trips
0 d1mm9tcy42 2014-01-01 20140101000936 2014-01-04 MALE 62.0 basic 0 en sem-non-brand ... 0.0 23.0 0.0 0.0 0.0 25.0 0.0 0.0 0.0 0.0
1 yo8nz8bqcq 2014-01-01 20140101001558 NaN NaN NaN basic 0 en direct ... 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0
2 4grx6yxeby 2014-01-01 20140101001639 NaN NaN NaN basic 0 en sem-brand ... 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0
3 ncf87guaf0 2014-01-01 20140101002146 NaN NaN NaN basic 0 en direct ... 0.0 32.0 0.0 0.0 0.0 10.0 0.0 0.0 0.0 0.0
4 4rvqpxoh3h 2014-01-01 20140101002619 2014-01-02 NaN NaN basic 25 en direct ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

5 rows × 540 columns

In [14]:
## Checking for missing value
airbnb_df.isnull().sum()
Out[14]:
id                             0
date_account_created           0
timestamp_first_active         0
date_first_booking         45041
gender                     37788
                           ...  
wishlist_content_update        3
wishlist_note                  3
your_listings                  3
your_reservations              3
your_trips                     3
Length: 540, dtype: int64
Univariant analysis¶

We will see the distribution of target variable country_destination

In [15]:
plt.figure(figsize=(10,5))
total = float(len(airbnb_df))
plot = sns.countplot(x='country_destination', data=airbnb_df)
plt.xlabel('Destination Country')
plt.ylabel('Number of users')
for p in plot.patches:
    percentage = '{:.1f}%'.format(100 * p.get_height()/total)
    x = p.get_x() + p.get_width()
    y = p.get_height()
    plot.annotate(percentage, (x, y),ha='center')

The data in the above graph shows that 61% of users did not book any destination (NDF-No destination found) and US(27%) seems to be a most booked country. Since all users in the dataset are from US, it shows that US travlers most likely to travel within US. Here we are only considering users who made at least one reservation, therefore we need to exclude NDF from the target column country_destination.

In [16]:
airbnb_df = airbnb_df[airbnb_df['country_destination']!='NDF']
Analyzing the user's age¶
In [17]:
airbnb_df.age.describe()
Out[17]:
count    22700.000000
mean        41.917753
std        106.748982
min         15.000000
25%         27.000000
50%         32.000000
75%         41.000000
max       2014.000000
Name: age, dtype: float64

The maximum age is 2014 which could not be possible. It looks like a year. We need to change the age column from year to age. Also need to set the limit for age like minimum age and maximum age.

In [18]:
airbnb_df_with_year = airbnb_df['age'] > 1000
airbnb_df.loc[airbnb_df_with_year, 'age'] = 2015 - airbnb_df.loc[airbnb_df_with_year, 'age']
airbnb_df.loc[airbnb_df.age > 95, 'age'] = np.nan
airbnb_df.loc[airbnb_df.age < 16, 'age'] = np.nan
airbnb_df['age'].fillna(-1, inplace=True)
In [19]:
airbnb_df.head()
Out[19]:
id date_account_created timestamp_first_active date_first_booking gender age signup_method signup_flow language affiliate_channel ... view_resolutions view_search_results view_security_checks view_user_real_names wishlist wishlist_content_update wishlist_note your_listings your_reservations your_trips
0 d1mm9tcy42 2014-01-01 20140101000936 2014-01-04 MALE 62.0 basic 0 en sem-non-brand ... 0.0 23.0 0.0 0.0 0.0 25.0 0.0 0.0 0.0 0.0
4 4rvqpxoh3h 2014-01-01 20140101002619 2014-01-02 NaN -1.0 basic 25 en direct ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
6 xwxei6hdk4 2014-01-01 20140101002742 2014-01-07 FEMALE 32.0 facebook 0 en seo ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
8 ro2stddszp 2014-01-01 20140101005503 2014-12-04 NaN 19.0 basic 0 en sem-brand ... 0.0 5.0 0.0 0.0 0.0 7.0 0.0 0.0 0.0 0.0
10 awiurksqr3 2014-01-01 20140101010113 2014-01-02 FEMALE 32.0 facebook 0 en direct ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

5 rows × 540 columns

In [20]:
# Cast date time column to property date time format and Split dates into day, week, month, year.
airbnb_df["date_account_created"] = pd.to_datetime(airbnb_df["date_account_created"])
airbnb_df["date_account_created_day"] = airbnb_df.date_account_created.dt.weekday
airbnb_df["date_account_created_month"] = airbnb_df.date_account_created.dt.month
airbnb_df["date_account_created_year"] = airbnb_df.date_account_created.dt.year

# Cast date time column to property date time format and split dates into day, week, month, year.
airbnb_df['timestamp_first_active'] = pd.to_datetime(airbnb_df['timestamp_first_active'], format='%Y%m%d%H%M%S')
airbnb_df["timestamp_first_active_day"] = airbnb_df.timestamp_first_active.dt.weekday
airbnb_df["timestamp_first_active_month"] = airbnb_df.timestamp_first_active.dt.month
airbnb_df["timestamp_first_active_year"] = airbnb_df.timestamp_first_active.dt.year
airbnb_df["timestamp_first_active_hour"] = airbnb_df.timestamp_first_active.dt.hour

# Make the split datatime fields into object data types for model training
airbnb_df["date_account_created_day"] = airbnb_df["date_account_created_day"].astype("O")
airbnb_df["date_account_created_month"] = airbnb_df["date_account_created_month"].astype("O")
airbnb_df["date_account_created_year"] = airbnb_df["date_account_created_year"].astype("O")
airbnb_df["timestamp_first_active_day"] = airbnb_df["timestamp_first_active_day"].astype("O")
airbnb_df["timestamp_first_active_month"] = airbnb_df["timestamp_first_active_month"].astype("O")
airbnb_df["timestamp_first_active_year"] = airbnb_df["timestamp_first_active_year"].astype("O")
airbnb_df["timestamp_first_active_hour"] = airbnb_df["timestamp_first_active_hour"].astype("O")

# Change the signup_flow to an object data type
airbnb_df["signup_flow"] = airbnb_df["signup_flow"].astype("O")

# Dropping these columns because they have already been altered and don't need duplicate data types
drop_cols = ["date_account_created","timestamp_first_active","date_first_booking", "age", "id"]
airbnb_df.drop(columns=drop_cols, inplace=True)
In [21]:
plt.figure(figsize=(10,5))
total = float(len(airbnb_df))
plot = sns.countplot(x='timestamp_first_active_day', data=airbnb_df)
plt.xlabel('Day of Week')
plt.ylabel('Number of users')
plt.xticks([0, 1, 2, 3, 4, 5, 6], ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday'])
for p in plot.patches:
    percentage = '{:.1f}%'.format(100 * p.get_height()/total)
    x = p.get_x() + p.get_width()
    y = p.get_height()
    plot.annotate(percentage, (x, y),ha='center')

Depending on the day of week, there are fluctuations in when customers are getting onto the Airbnb site. Tuesday is the peak at 16.1% of the total dataset. With the lowest value being Friday at 11.5% of the total dataset.

In [22]:
plt.figure(figsize=(10,5))
total = float(len(airbnb_df))
plot = sns.countplot(x='timestamp_first_active_month', data=airbnb_df)
plt.xlabel('Month of Year')
plt.ylabel('Number of users')
plt.xticks([0, 1, 2, 3, 4, 5], ['January', 'February', 'March', 'April', 'May', 'June'])

for p in plot.patches:
    percentage = '{:.1f}%'.format(100 * p.get_height()/total)
    x = p.get_x() + p.get_width()
    y = p.get_height()
    plot.annotate(percentage, (x, y),ha='center')

The above plot shows the relationship between the month of year and the number of users that first started using the Airbnb application. There is an upward tick from January to June in this dataset which would lead us to believe that there is a relationship between month of year and number of new users.

In [23]:
# Getting categorical datasets ready for One Hot Encoding
cat_cols = [col for col in airbnb_df.columns if airbnb_df[col].dtype =="O"]
cat_cols.remove("country_destination")

print(cat_cols)

# One Hot Encoding for the category datasets
train_df = pd.get_dummies(airbnb_df, columns=cat_cols, drop_first=True)
train_df.info()
['gender', 'signup_method', 'signup_flow', 'language', 'affiliate_channel', 'affiliate_provider', 'first_affiliate_tracked', 'signup_app', 'first_device_type', 'first_browser', 'date_account_created_day', 'date_account_created_month', 'date_account_created_year', 'timestamp_first_active_day', 'timestamp_first_active_month', 'timestamp_first_active_year', 'timestamp_first_active_hour']
<class 'pandas.core.frame.DataFrame'>
Int64Index: 28774 entries, 0 to 73809
Columns: 666 entries, country_destination to timestamp_first_active_hour_23
dtypes: float64(524), object(1), uint8(141)
memory usage: 119.3+ MB
In [24]:
train_df.reset_index(inplace=True)
train_df.fillna(-1, inplace=True)
In [25]:
from sklearn.preprocessing import LabelEncoder

y = train_df['country_destination']
X = train_df.drop(columns='country_destination')

# Get unique values of the column
unique_values = train_df['country_destination'].unique()

# Initialize and fit the LabelEncoder
le = LabelEncoder()
le.fit(unique_values)

# Transform the target variable using labels
y_xgb = le.transform(y)
In [26]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size =0.25, random_state=12)

Model Training¶

Decision Tree Clasification¶

In [27]:
tree_model = DecisionTreeClassifier(criterion='gini', max_depth=5, min_samples_split= 50)
tree_model.fit(X_train, y_train)
y_pred = tree_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(classification_report(y_test, y_pred))
print('Accuracy of decision tree model is {:.5f}'.format(accuracy))
              precision    recall  f1-score   support

          AU       0.00      0.00      0.00        42
          CA       0.00      0.00      0.00       115
          DE       0.00      0.00      0.00        64
          ES       0.00      0.00      0.00       180
          FR       0.50      0.01      0.01       378
          GB       0.00      0.00      0.00       206
          IT       0.08      0.00      0.01       228
          NL       0.00      0.00      0.00        53
          PT       0.00      0.00      0.00        21
          US       0.70      1.00      0.82      5016
       other       0.00      0.00      0.00       891

    accuracy                           0.70      7194
   macro avg       0.12      0.09      0.08      7194
weighted avg       0.52      0.70      0.57      7194

Accuracy of decision tree model is 0.69739

Random Forest Classification¶

In [28]:
randomforest_model = RandomForestClassifier(n_estimators = 500, criterion='gini', max_depth=6)
randomforest_model.fit(X_train, y_train)
y_pred = randomforest_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(classification_report(y_test, y_pred))
print('Accuracy of random forest model is {:.5f}'.format(accuracy))
              precision    recall  f1-score   support

          AU       0.00      0.00      0.00        42
          CA       0.00      0.00      0.00       115
          DE       0.00      0.00      0.00        64
          ES       0.00      0.00      0.00       180
          FR       0.00      0.00      0.00       378
          GB       0.00      0.00      0.00       206
          IT       0.00      0.00      0.00       228
          NL       0.00      0.00      0.00        53
          PT       0.00      0.00      0.00        21
          US       0.70      1.00      0.82      5016
       other       0.00      0.00      0.00       891

    accuracy                           0.70      7194
   macro avg       0.06      0.09      0.07      7194
weighted avg       0.49      0.70      0.57      7194

Accuracy of random forest model is 0.69725

XGBoost Classification¶

In [29]:
X_train, X_test, y_train, y_test = train_test_split(X, y_xgb, test_size =0.25, random_state=12)
In [32]:
print("Parallel Parameter optimization")
xgb_model = xgb.XGBClassifier(n_jobs=multiprocessing.cpu_count() // 2)
clf = GridSearchCV(xgb_model, {'max_depth': [2, 4, 6],
                               'n_estimators': [50, 100, 200]}, verbose=1,
                   n_jobs=2)
clf.fit(X_train, y_train)

print(clf.best_score_)
print(clf.best_params_)
Parallel Parameter optimization
Fitting 5 folds for each of 9 candidates, totalling 45 fits
0.6987951807228916
{'max_depth': 2, 'n_estimators': 100}

After running the above code we got the max_depth of 4 and the n_estimators equal to 50

In [33]:
xgb_model = xgb.XGBClassifier(max_depth=2, n_estimators=100)
xgb_model.fit(X_train, y_train)
y_pred = xgb_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(classification_report(y_test, y_pred))
print('Accuracy of XGBoost Model is {:.5f}'.format(accuracy))
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        42
           1       0.00      0.00      0.00       115
           2       0.00      0.00      0.00        64
           3       0.00      0.00      0.00       180
           4       0.40      0.01      0.02       378
           5       0.00      0.00      0.00       206
           6       0.20      0.00      0.01       228
           7       0.00      0.00      0.00        53
           8       0.00      0.00      0.00        21
           9       0.70      1.00      0.82      5016
          10       0.44      0.01      0.02       891

    accuracy                           0.70      7194
   macro avg       0.16      0.09      0.08      7194
weighted avg       0.57      0.70      0.58      7194

Accuracy of XGBoost Model is 0.69794