## Load the libraries
import os
import numpy as np
import pandas as pd
# data viz
import matplotlib.pyplot as plt
import seaborn as sns
# machine learning
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import xgboost as xgb
import multiprocessing
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline
## Load the dataset into a pandas data frame
airbnb_train = pd.read_csv(os.path.abspath('train_users_2.csv'))
airbnb_test = pd.read_csv(os.path.abspath('test_users.csv'))
airbnb_sessions = pd.read_csv(os.path.abspath('sessions.csv'))
print("There are", airbnb_train.shape[0], "users and", airbnb_train.shape[1], "features in the training set and", airbnb_test.shape[0], "users and", airbnb_test.shape[1], "features in the test set.")
There are 213451 users and 16 features in the training set and 62096 users and 15 features in the test set.
print("Total users in training and test sets are:", airbnb_train.shape[0]+airbnb_test.shape[0])
Total users in training and test sets are: 275547
print("There are", airbnb_sessions.shape[0], "users and", airbnb_sessions.shape[1], "features in the session set")
There are 10567737 users and 6 features in the session set
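Note that sessions.csv holds one row per logged action rather than one row per user, so a quick check of the number of distinct users (a small sketch) keeps the two figures apart:
# Each row of sessions.csv is one logged action; count distinct users separately
print("Distinct users in the session set:", airbnb_sessions['user_id'].nunique())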
We group the session data by user and by `action`, `action_type`, and `action_detail` to get the number of times each user performed each action, turning the raw session log into features the models can train on.
session1 = airbnb_sessions.groupby(["user_id","action"]).agg({"action": "count"})
session2 = airbnb_sessions.groupby(["user_id","action_type"]).agg({"action_type": "count"})
session3 = airbnb_sessions.groupby(["user_id","action_detail"]).agg({"action_detail": "count"})
session1 = session1.rename(columns={'action':'action_count'}).reset_index()
session2 = session2.rename(columns={'action_type':'action_type_count'}).reset_index()
session3 = session3.rename(columns={'action_detail':'action_detail_count'}).reset_index()
Now we pivot each of the grouped tables to make the data wide, with one row per user and one count column per action value, a format the machine learning models can consume.
session1 = session1.pivot_table(index='user_id', columns='action', values='action_count').fillna(0)
session2 = session2.pivot_table(index='user_id', columns='action_type', values='action_type_count').fillna(0)
session3 = session3.pivot_table(index='user_id', columns='action_detail', values='action_detail_count').fillna(0)
session_pivot = session1.merge(session2, on='user_id', how='outer').merge(session3, on='user_id', how='outer')
session_pivot.head()
Output preview of `session_pivot.head()` (5 rows × 524 columns): one row per `user_id` and one zero-filled count column for every `action`, `action_type`, and `action_detail` value (for example `view_search_results`, `wishlist_content_update`, and `your_trips`).
airbnb_df = airbnb_train.merge(session_pivot, left_on='id', right_on='user_id', how='inner')
print("There are", airbnb_df.shape[0],"observations and", airbnb_df.shape[1], "features in the dataset")
There are 73815 observations and 540 features in the dataset
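Because this is an inner join, training users with no rows in sessions.csv are dropped. A minimal sketch (assuming `session_pivot` is still indexed by `user_id`, as above) of how many users the merge discards:
# Training users that never appear in the session log are lost in the inner join
users_with_sessions = airbnb_train['id'].isin(session_pivot.index)
print("Training users without session data:", (~users_with_sessions).sum())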
airbnb_df.head()
Output preview of `airbnb_df.head()` (5 rows × 540 columns): the original user-level features (`id`, `date_account_created`, `timestamp_first_active`, `date_first_booking`, `gender`, `age`, `signup_method`, `signup_flow`, `language`, `affiliate_channel`, ...) followed by the per-user session count columns. Several rows show the literal value `-unknown-` in `gender` and missing values in `age` and `date_first_booking`.
By looking at the data, we can see that the `gender` and `first_browser` columns contain `-unknown-` values. We replace `-unknown-` with `NaN`.
airbnb_df.replace('-unknown-', np.nan, inplace=True)
airbnb_df.head()
Output preview of `airbnb_df.head()` (5 rows × 540 columns): the same preview as above, with the former `-unknown-` entries now shown as `NaN`.
## Checking for missing values
airbnb_df.isnull().sum()
id                             0
date_account_created           0
timestamp_first_active         0
date_first_booking         45041
gender                     37788
                           ...
wishlist_content_update        3
wishlist_note                  3
your_listings                  3
your_reservations              3
your_trips                     3
Length: 540, dtype: int64
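With 540 columns the full listing is hard to scan; a short sketch to surface only the columns with the most missing values:
# Show the ten columns with the largest number of missing values
missing = airbnb_df.isnull().sum()
print(missing[missing > 0].sort_values(ascending=False).head(10))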
Next, let's look at the distribution of the target variable `country_destination`.
plt.figure(figsize=(10,5))
total = float(len(airbnb_df))
plot = sns.countplot(x='country_destination', data=airbnb_df)
plt.xlabel('Destination Country')
plt.ylabel('Number of users')
# Annotate each bar with its percentage of the total
for p in plot.patches:
    percentage = '{:.1f}%'.format(100 * p.get_height() / total)
    x = p.get_x() + p.get_width() / 2   # center the label over the bar
    y = p.get_height()
    plot.annotate(percentage, (x, y), ha='center')
The graph above shows that about 61% of users did not book any destination (NDF, i.e. no destination found), and the US (27%) is the most-booked country. Since all users in the dataset are from the US, this suggests that US travelers are most likely to travel within the US.
Here we only consider users who made at least one reservation, so we exclude NDF from the target column `country_destination`.
airbnb_df = airbnb_df[airbnb_df['country_destination']!='NDF']
airbnb_df.age.describe()
count    22700.000000
mean        41.917753
std        106.748982
min         15.000000
25%         27.000000
50%         32.000000
75%         41.000000
max       2014.000000
Name: age, dtype: float64
The maximum age is 2014, which is impossible; it looks like a birth year was entered instead of an age. We need to convert those year values into ages and also impose sensible minimum and maximum limits on the age column.
# Rows where a birth year (e.g. 2014) was entered instead of an age
airbnb_df_with_year = airbnb_df['age'] > 1000
airbnb_df.loc[airbnb_df_with_year, 'age'] = 2015 - airbnb_df.loc[airbnb_df_with_year, 'age']
# Treat implausible ages as missing, then flag missing ages with -1
airbnb_df.loc[airbnb_df.age > 95, 'age'] = np.nan
airbnb_df.loc[airbnb_df.age < 16, 'age'] = np.nan
airbnb_df['age'].fillna(-1, inplace=True)
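A quick sanity check (a sketch; the `-1` placeholder is excluded here) that the cleaned ages fall in the expected range:
# Real ages should now lie between 16 and 95; missing ages were coded as -1
print(airbnb_df.loc[airbnb_df['age'] > 0, 'age'].describe())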
airbnb_df.head()
Output preview of `airbnb_df.head()` (5 rows × 540 columns): only rows with a booking remain, and the cleaned `age` column now holds either a plausible age or the `-1` placeholder for missing values.
# Cast date_account_created to a proper datetime format and split it into weekday, month, and year
airbnb_df["date_account_created"] = pd.to_datetime(airbnb_df["date_account_created"])
airbnb_df["date_account_created_day"] = airbnb_df.date_account_created.dt.weekday
airbnb_df["date_account_created_month"] = airbnb_df.date_account_created.dt.month
airbnb_df["date_account_created_year"] = airbnb_df.date_account_created.dt.year
# Cast timestamp_first_active to a proper datetime format and split it into weekday, month, year, and hour
airbnb_df['timestamp_first_active'] = pd.to_datetime(airbnb_df['timestamp_first_active'], format='%Y%m%d%H%M%S')
airbnb_df["timestamp_first_active_day"] = airbnb_df.timestamp_first_active.dt.weekday
airbnb_df["timestamp_first_active_month"] = airbnb_df.timestamp_first_active.dt.month
airbnb_df["timestamp_first_active_year"] = airbnb_df.timestamp_first_active.dt.year
airbnb_df["timestamp_first_active_hour"] = airbnb_df.timestamp_first_active.dt.hour
# Convert the split datetime fields to object dtype so they are treated as categorical features
airbnb_df["date_account_created_day"] = airbnb_df["date_account_created_day"].astype("O")
airbnb_df["date_account_created_month"] = airbnb_df["date_account_created_month"].astype("O")
airbnb_df["date_account_created_year"] = airbnb_df["date_account_created_year"].astype("O")
airbnb_df["timestamp_first_active_day"] = airbnb_df["timestamp_first_active_day"].astype("O")
airbnb_df["timestamp_first_active_month"] = airbnb_df["timestamp_first_active_month"].astype("O")
airbnb_df["timestamp_first_active_year"] = airbnb_df["timestamp_first_active_year"].astype("O")
airbnb_df["timestamp_first_active_hour"] = airbnb_df["timestamp_first_active_hour"].astype("O")
# Change the signup_flow to an object data type
airbnb_df["signup_flow"] = airbnb_df["signup_flow"].astype("O")
# Drop the original date/time columns (now replaced by the split features) along with date_first_booking, age, and id, which are not used for modeling
drop_cols = ["date_account_created","timestamp_first_active","date_first_booking", "age", "id"]
airbnb_df.drop(columns=drop_cols, inplace=True)
plt.figure(figsize=(10,5))
total = float(len(airbnb_df))
plot = sns.countplot(x='timestamp_first_active_day', data=airbnb_df)
plt.xlabel('Day of Week')
plt.ylabel('Number of users')
# pandas dt.weekday encodes Monday as 0 and Sunday as 6
plt.xticks([0, 1, 2, 3, 4, 5, 6], ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'])
for p in plot.patches:
    percentage = '{:.1f}%'.format(100 * p.get_height() / total)
    x = p.get_x() + p.get_width() / 2   # center the label over the bar
    y = p.get_height()
    plot.annotate(percentage, (x, y), ha='center')
Sign-up activity fluctuates with the day of the week: the busiest day accounts for 16.1% of the dataset and the quietest for 11.5% (note that `dt.weekday` encodes Monday as 0, which is why the tick labels above start with Monday).
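The same percentages can be read off without the plot; a short sketch using `value_counts`:
# Share of users by weekday of first activity (0 = Monday ... 6 = Sunday)
day_share = airbnb_df['timestamp_first_active_day'].value_counts(normalize=True).sort_index()
print((day_share * 100).round(1))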
plt.figure(figsize=(10,5))
total = float(len(airbnb_df))
plot = sns.countplot(x='timestamp_first_active_month', data=airbnb_df)
plt.xlabel('Month of Year')
plt.ylabel('Number of users')
plt.xticks([0, 1, 2, 3, 4, 5], ['January', 'February', 'March', 'April', 'May', 'June'])
for p in plot.patches:
    percentage = '{:.1f}%'.format(100 * p.get_height() / total)
    x = p.get_x() + p.get_width() / 2   # center the label over the bar
    y = p.get_height()
    plot.annotate(percentage, (x, y), ha='center')
The plot above shows the relationship between the month of the year and the number of users who first became active on Airbnb. There is an upward trend from January to June in this dataset, which suggests a relationship between the month of the year and the number of new users.
# Collect the categorical columns for one-hot encoding
cat_cols = [col for col in airbnb_df.columns if airbnb_df[col].dtype =="O"]
cat_cols.remove("country_destination")
print(cat_cols)
# One-hot encode the categorical columns
train_df = pd.get_dummies(airbnb_df, columns=cat_cols, drop_first=True)
train_df.info()
['gender', 'signup_method', 'signup_flow', 'language', 'affiliate_channel', 'affiliate_provider', 'first_affiliate_tracked', 'signup_app', 'first_device_type', 'first_browser', 'date_account_created_day', 'date_account_created_month', 'date_account_created_year', 'timestamp_first_active_day', 'timestamp_first_active_month', 'timestamp_first_active_year', 'timestamp_first_active_hour']
<class 'pandas.core.frame.DataFrame'>
Int64Index: 28774 entries, 0 to 73809
Columns: 666 entries, country_destination to timestamp_first_active_hour_23
dtypes: float64(524), object(1), uint8(141)
memory usage: 119.3+ MB
train_df.reset_index(inplace=True)
train_df.fillna(-1, inplace=True)
from sklearn.preprocessing import LabelEncoder
y = train_df['country_destination']
X = train_df.drop(columns='country_destination')
# Get unique values of the column
unique_values = train_df['country_destination'].unique()
# Initialize and fit the LabelEncoder
le = LabelEncoder()
le.fit(unique_values)
# Transform the target variable using labels
y_xgb = le.transform(y)
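XGBoost will return integer predictions, so it is useful to keep the class-to-integer mapping handy; a small sketch:
# Mapping from destination country to the integer label produced by the encoder
label_map = dict(zip(le.classes_, le.transform(le.classes_)))
print(label_map)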
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size =0.25, random_state=12)
tree_model = DecisionTreeClassifier(criterion='gini', max_depth=5, min_samples_split= 50)
tree_model.fit(X_train, y_train)
y_pred = tree_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(classification_report(y_test, y_pred))
print('Accuracy of decision tree model is {:.5f}'.format(accuracy))
              precision    recall  f1-score   support

          AU       0.00      0.00      0.00        42
          CA       0.00      0.00      0.00       115
          DE       0.00      0.00      0.00        64
          ES       0.00      0.00      0.00       180
          FR       0.50      0.01      0.01       378
          GB       0.00      0.00      0.00       206
          IT       0.08      0.00      0.01       228
          NL       0.00      0.00      0.00        53
          PT       0.00      0.00      0.00        21
          US       0.70      1.00      0.82      5016
       other       0.00      0.00      0.00       891

    accuracy                           0.70      7194
   macro avg       0.12      0.09      0.08      7194
weighted avg       0.52      0.70      0.57      7194

Accuracy of decision tree model is 0.69739
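The report shows the decision tree predicting `US` for almost every user. A confusion matrix (using `confusion_matrix`, already imported above) makes that collapse explicit; a minimal sketch:
# Confusion matrix for the decision tree: rows are true classes, columns are predictions
cm = confusion_matrix(y_test, y_pred, labels=tree_model.classes_)
print(pd.DataFrame(cm, index=tree_model.classes_, columns=tree_model.classes_))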
randomforest_model = RandomForestClassifier(n_estimators = 500, criterion='gini', max_depth=6)
randomforest_model.fit(X_train, y_train)
y_pred = randomforest_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(classification_report(y_test, y_pred))
print('Accuracy of random forest model is {:.5f}'.format(accuracy))
              precision    recall  f1-score   support

          AU       0.00      0.00      0.00        42
          CA       0.00      0.00      0.00       115
          DE       0.00      0.00      0.00        64
          ES       0.00      0.00      0.00       180
          FR       0.00      0.00      0.00       378
          GB       0.00      0.00      0.00       206
          IT       0.00      0.00      0.00       228
          NL       0.00      0.00      0.00        53
          PT       0.00      0.00      0.00        21
          US       0.70      1.00      0.82      5016
       other       0.00      0.00      0.00       891

    accuracy                           0.70      7194
   macro avg       0.06      0.09      0.07      7194
weighted avg       0.49      0.70      0.57      7194

Accuracy of random forest model is 0.69725
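Since the random forest also predicts mostly `US`, it is worth checking which features it relies on. A sketch using the fitted model's impurity-based feature importances:
# Top 10 features by impurity-based importance in the fitted random forest
importances = pd.Series(randomforest_model.feature_importances_, index=X_train.columns)
print(importances.sort_values(ascending=False).head(10))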
X_train, X_test, y_train, y_test = train_test_split(X, y_xgb, test_size =0.25, random_state=12)
print("Parallel Parameter optimization")
xgb_model = xgb.XGBClassifier(n_jobs=multiprocessing.cpu_count() // 2)
clf = GridSearchCV(xgb_model, {'max_depth': [2, 4, 6],
'n_estimators': [50, 100, 200]}, verbose=1,
n_jobs=2)
clf.fit(X_train, y_train)
print(clf.best_score_)
print(clf.best_params_)
Parallel Parameter optimization
Fitting 5 folds for each of 9 candidates, totalling 45 fits
0.6987951807228916
{'max_depth': 2, 'n_estimators': 100}
After running the grid search, the best parameters are max_depth = 2 and n_estimators = 100, so we train the final XGBoost model with those values.
xgb_model = xgb.XGBClassifier(max_depth=2, n_estimators=100)
xgb_model.fit(X_train, y_train)
y_pred = xgb_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(classification_report(y_test, y_pred))
print('Accuracy of XGBoost Model is {:.5f}'.format(accuracy))
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        42
           1       0.00      0.00      0.00       115
           2       0.00      0.00      0.00        64
           3       0.00      0.00      0.00       180
           4       0.40      0.01      0.02       378
           5       0.00      0.00      0.00       206
           6       0.20      0.00      0.01       228
           7       0.00      0.00      0.00        53
           8       0.00      0.00      0.00        21
           9       0.70      1.00      0.82      5016
          10       0.44      0.01      0.02       891

    accuracy                           0.70      7194
   macro avg       0.16      0.09      0.08      7194
weighted avg       0.57      0.70      0.58      7194

Accuracy of XGBoost Model is 0.69794
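Since `y_xgb` was label-encoded, the classes in this report are integers rather than country codes. A small sketch (reusing the `LabelEncoder` fitted earlier) to reprint the report with readable class names:
# Reprint the XGBoost report with the original country codes as class names
print(classification_report(y_test, y_pred, target_names=le.classes_))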