In [32]:
# data analysis and wrangling
import pandas as pd
import numpy as np
import random as rnd

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
In [33]:
train_df = pd.read_csv('/Users/Jackie/Desktop/py4e/titanic/train.csv')
test = pd.read_csv('/Users/Jackie/Desktop/py4e/titanic/test.csv')
In [34]:
test.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           417 non-null float64
Cabin          91 non-null object
Embarked       418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB
In [35]:
train_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
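
The info() output above shows where values are missing: Age, Cabin and Embarked in the training set, and Age, Fare and Cabin in the test set. A quick way to see the same counts directly, as a convenience check rather than part of the original cells:

In [ ]:
# missing values per column (the counts can be read off the info() summaries above)
print(train_df.isnull().sum())
print(test.isnull().sum())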
In [135]:
# Counter will be used to tally how many features flag each row as an outlier
from collections import Counter
In [136]:
# running list of row indices flagged as outliers
outlier_indices = []
In [169]:
# 1st and 3rd quartiles (25% / 75%) of Fare, ignoring missing values
Q1 = np.percentile(train_df['Fare'].dropna(), 25)
Q3 = np.percentile(train_df['Fare'].dropna(), 75)
# interquartile range and Tukey outlier step (1.5 * IQR)
IQR = Q3 - Q1
outlier_step = IQR * 1.5
print(Q1)
print(Q3)
print(IQR)
print(outlier_step)
7.9104
31.0
23.0896
34.6344
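
From the printed values, the Tukey fences for Fare are Q1 - 1.5*IQR = 7.9104 - 34.6344 = -26.724 and Q3 + 1.5*IQR = 31.0 + 34.6344 = 65.6344. A small sketch of the same computation (the fence names are illustrative, not from the original cells):

In [ ]:
# lower and upper Tukey fences for Fare
lower_fence = Q1 - outlier_step   # -26.724
upper_fence = Q3 + outlier_step   # 65.6344
print(lower_fence, upper_fence)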
In [172]:
# indices of the rows whose Fare falls outside the Tukey fences
outlier_list_col = train_df[(train_df['Fare'] < Q1 - outlier_step) | (train_df['Fare'] > Q3 + outlier_step)].index
In [171]:
outlier_list_col
Out[171]:
Int64Index([  1,  27,  31,  34,  52,  61,  62,  72,  88, 102,
            ...
            792, 802, 820, 829, 835, 846, 849, 856, 863, 879],
           dtype='int64', length=116)
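
All 116 flagged rows sit above the upper fence: the lower fence is negative and no fare is below zero. A quick check along these lines (a sketch, not an original cell):

In [ ]:
# every flagged row should come from the upper fence
print((train_df['Fare'] > Q3 + outlier_step).sum())   # expected: 116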
In [140]:
# append the outlier indices found for Fare to the list of outlier indices
outlier_indices.extend(outlier_list_col)
In [141]:
outlier_indices
Out[141]:
[1,
 27,
 31,
 34,
 52,
 ...
 849,
 856,
 863,
 879]
In [142]:
# count how many times each index was flagged (Counter behaves like a dict)
outlier_indices = Counter(outlier_indices)
In [143]:
outlier_indices
Out[143]:
Counter({1: 1,
         27: 1,
         31: 1,
         34: 1,
         52: 1,
         ...
         856: 1,
         863: 1,
         879: 1})
In [127]:
# alternative way: collect the same indices with an explicit loop
# (use a name other than `list` so the built-in is not shadowed)
flagged = []
for x, y in outlier_indices.items():
    if y == 1:
        flagged.append(x)
print(flagged)
[1, 27, 31, 34, 52, 61, 62, 72, 88, 102, 118, 120, 124, 139, 151, 159, 180, 195, 201, 215, 218, 224, 230, 245, 256, 257, 258, 262, 268, 269, 275, 290, 291, 297, 299, 305, 306, 307, 310, 311, 318, 319, 324, 325, 332, 334, 336, 337, 341, 366, 369, 373, 375, 377, 380, 385, 390, 393, 412, 435, 438, 445, 453, 484, 486, 496, 498, 504, 505, 520, 527, 537, 540, 544, 550, 557, 558, 581, 585, 587, 591, 609, 627, 641, 645, 655, 659, 660, 665, 679, 681, 689, 698, 700, 708, 716, 730, 737, 741, 742, 745, 759, 763, 765, 779, 789, 792, 802, 820, 829, 835, 846, 849, 856, 863, 879]
In [163]:
# same idea as a comprehension; with only Fare processed every count is 1,
# so this keeps all flagged rows (the final function filters on v > n instead)
multiple = [k for k, v in outlier_indices.items() if v == 1]
In [164]:
multiple
Out[164]:
[1,
 27,
 31,
 34,
 52,
 ...
 849,
 856,
 863,
 879]
In [173]:
def detect_outliers(df,n,features):
    """
    Takes a dataframe df of features and returns a list of the indices
    corresponding to the observations containing more than n outliers according
    to the Tukey method.
    """
    outlier_indices = []
    
    # iterate over features (columns)
    for col in features:
        # 1st quartile (25%)
        Q1 = np.percentile(df[col].dropna(), 25)
        # 3rd quartile (75%)
        Q3 = np.percentile(df[col].dropna(), 75)
        # interquartile range (IQR)
        IQR = Q3 - Q1

        # outlier step
        outlier_step = 1.5 * IQR

        # determine the indices of outliers for feature col
        outlier_list_col = df[(df[col] < Q1 - outlier_step) | (df[col] > Q3 + outlier_step)].index

        # append the found outlier indices for col to the list of outlier indices
        outlier_indices.extend(outlier_list_col)

    # select observations flagged as outliers in more than n features
    outlier_indices = Counter(outlier_indices)
    multiple_outliers = [k for k, v in outlier_indices.items() if v > n]

    return multiple_outliers
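
The Counter step is what turns per-column outliers into per-row decisions: an index is kept only if it was flagged in more than n of the inspected columns. A toy illustration on made-up data (not the Titanic set):

In [ ]:
# hypothetical frame: row 4 is extreme in columns a and b, so it is an outlier in 2 columns
toy = pd.DataFrame({'a': [1, 1, 1, 1, 100],
                    'b': [2, 2, 2, 2, 200],
                    'c': [3, 3, 3, 3, 3]})
print(detect_outliers(toy, 1, ['a', 'b', 'c']))   # [4]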
In [174]:
# rows that are outliers in more than 2 of Age, SibSp, Parch and Fare
Outliers_to_drop = detect_outliers(train_df, 2, ["Age", "SibSp", "Parch", "Fare"])
In [177]:
Outliers_to_drop
Out[177]:
[745, 27, 88, 159, 180, 201, 324, 341, 792, 846, 863]
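
These are the eleven rows flagged as outliers in more than two of Age, SibSp, Parch and Fare. To look at them before dropping (a convenience check, not an original cell):

In [ ]:
# inspect the passengers that are about to be dropped
train_df.loc[Outliers_to_drop, ["Age", "SibSp", "Parch", "Fare"]]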
In [175]:
# drop the detected outlier rows and reset the index
df_train = train_df.drop(Outliers_to_drop, axis=0).reset_index(drop=True)
In [176]:
df_train
Out[176]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
... ... ... ... ... ... ... ... ... ... ... ... ...
875 887 0 2 Montvila, Rev. Juozas male 27.0 0 0 211536 13.0000 NaN S
876 888 1 1 Graham, Miss. Margaret Edith female 19.0 0 0 112053 30.0000 B42 S
877 889 0 3 Johnston, Miss. Catherine Helen "Carrie" female NaN 1 2 W./C. 6607 23.4500 NaN S
878 890 1 1 Behr, Mr. Karl Howell male 26.0 0 0 111369 30.0000 C148 C
879 891 0 3 Dooley, Mr. Patrick male 32.0 0 0 370376 7.7500 NaN Q

880 rows × 12 columns
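
Dropping the 11 flagged rows from the 891-row training set leaves the 880 rows shown above. A one-line sanity check (a sketch, assuming the variables above are still defined):

In [ ]:
# 891 original rows minus 11 outliers leaves 880
assert len(df_train) == len(train_df) - len(Outliers_to_drop)   # 880 == 891 - 11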

In [ ]: