Данные взяты отсюда
# standard Azure ML import
from azureml import Workspace
# these imports will be needed later
from numpy import sort
from matplotlib.pyplot import show
# standard data loading: open the workspace, look up the dataset by its
# name and convert it to a dataframe that the rest of the notebook uses
ws = Workspace()
ds = ws.datasets['BP_python_train.csv']
df = ds.to_dataframe()
Для начала стоит изучить наши данные: что они из себя представляют, какие признаки надо выбрать, какие можно выбросить, с какими надо ещё поработать (нормализовать, почистить).
df
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
5 | 6 | 0 | 3 | Moran, Mr. James | male | NaN | 0 | 0 | 330877 | 8.4583 | NaN | Q |
6 | 7 | 0 | 1 | McCarthy, Mr. Timothy J | male | 54.0 | 0 | 0 | 17463 | 51.8625 | E46 | S |
7 | 8 | 0 | 3 | Palsson, Master. Gosta Leonard | male | 2.0 | 3 | 1 | 349909 | 21.0750 | NaN | S |
8 | 9 | 1 | 3 | Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) | female | 27.0 | 0 | 2 | 347742 | 11.1333 | NaN | S |
9 | 10 | 1 | 2 | Nasser, Mrs. Nicholas (Adele Achem) | female | 14.0 | 1 | 0 | 237736 | 30.0708 | NaN | C |
10 | 11 | 1 | 3 | Sandstrom, Miss. Marguerite Rut | female | 4.0 | 1 | 1 | PP 9549 | 16.7000 | G6 | S |
11 | 12 | 1 | 1 | Bonnell, Miss. Elizabeth | female | 58.0 | 0 | 0 | 113783 | 26.5500 | C103 | S |
12 | 13 | 0 | 3 | Saundercock, Mr. William Henry | male | 20.0 | 0 | 0 | A/5. 2151 | 8.0500 | NaN | S |
13 | 14 | 0 | 3 | Andersson, Mr. Anders Johan | male | 39.0 | 1 | 5 | 347082 | 31.2750 | NaN | S |
14 | 15 | 0 | 3 | Vestrom, Miss. Hulda Amanda Adolfina | female | 14.0 | 0 | 0 | 350406 | 7.8542 | NaN | S |
15 | 16 | 1 | 2 | Hewlett, Mrs. (Mary D Kingcome) | female | 55.0 | 0 | 0 | 248706 | 16.0000 | NaN | S |
16 | 17 | 0 | 3 | Rice, Master. Eugene | male | 2.0 | 4 | 1 | 382652 | 29.1250 | NaN | Q |
17 | 18 | 1 | 2 | Williams, Mr. Charles Eugene | male | NaN | 0 | 0 | 244373 | 13.0000 | NaN | S |
18 | 19 | 0 | 3 | Vander Planke, Mrs. Julius (Emelia Maria Vande... | female | 31.0 | 1 | 0 | 345763 | 18.0000 | NaN | S |
19 | 20 | 1 | 3 | Masselmani, Mrs. Fatima | female | NaN | 0 | 0 | 2649 | 7.2250 | NaN | C |
20 | 21 | 0 | 2 | Fynney, Mr. Joseph J | male | 35.0 | 0 | 0 | 239865 | 26.0000 | NaN | S |
21 | 22 | 1 | 2 | Beesley, Mr. Lawrence | male | 34.0 | 0 | 0 | 248698 | 13.0000 | D56 | S |
22 | 23 | 1 | 3 | McGowan, Miss. Anna "Annie" | female | 15.0 | 0 | 0 | 330923 | 8.0292 | NaN | Q |
23 | 24 | 1 | 1 | Sloper, Mr. William Thompson | male | 28.0 | 0 | 0 | 113788 | 35.5000 | A6 | S |
24 | 25 | 0 | 3 | Palsson, Miss. Torborg Danira | female | 8.0 | 3 | 1 | 349909 | 21.0750 | NaN | S |
25 | 26 | 1 | 3 | Asplund, Mrs. Carl Oscar (Selma Augusta Emilia... | female | 38.0 | 1 | 5 | 347077 | 31.3875 | NaN | S |
26 | 27 | 0 | 3 | Emir, Mr. Farred Chehab | male | NaN | 0 | 0 | 2631 | 7.2250 | NaN | C |
27 | 28 | 0 | 1 | Fortune, Mr. Charles Alexander | male | 19.0 | 3 | 2 | 19950 | 263.0000 | C23 C25 C27 | S |
28 | 29 | 1 | 3 | O'Dwyer, Miss. Ellen "Nellie" | female | NaN | 0 | 0 | 330959 | 7.8792 | NaN | Q |
29 | 30 | 0 | 3 | Todoroff, Mr. Lalio | male | NaN | 0 | 0 | 349216 | 7.8958 | NaN | S |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
861 | 862 | 0 | 2 | Giles, Mr. Frederick Edward | male | 21.0 | 1 | 0 | 28134 | 11.5000 | NaN | S |
862 | 863 | 1 | 1 | Swift, Mrs. Frederick Joel (Margaret Welles Ba... | female | 48.0 | 0 | 0 | 17466 | 25.9292 | D17 | S |
863 | 864 | 0 | 3 | Sage, Miss. Dorothy Edith "Dolly" | female | NaN | 8 | 2 | CA. 2343 | 69.5500 | NaN | S |
864 | 865 | 0 | 2 | Gill, Mr. John William | male | 24.0 | 0 | 0 | 233866 | 13.0000 | NaN | S |
865 | 866 | 1 | 2 | Bystrom, Mrs. (Karolina) | female | 42.0 | 0 | 0 | 236852 | 13.0000 | NaN | S |
866 | 867 | 1 | 2 | Duran y More, Miss. Asuncion | female | 27.0 | 1 | 0 | SC/PARIS 2149 | 13.8583 | NaN | C |
867 | 868 | 0 | 1 | Roebling, Mr. Washington Augustus II | male | 31.0 | 0 | 0 | PC 17590 | 50.4958 | A24 | S |
868 | 869 | 0 | 3 | van Melkebeke, Mr. Philemon | male | NaN | 0 | 0 | 345777 | 9.5000 | NaN | S |
869 | 870 | 1 | 3 | Johnson, Master. Harold Theodor | male | 4.0 | 1 | 1 | 347742 | 11.1333 | NaN | S |
870 | 871 | 0 | 3 | Balkic, Mr. Cerin | male | 26.0 | 0 | 0 | 349248 | 7.8958 | NaN | S |
871 | 872 | 1 | 1 | Beckwith, Mrs. Richard Leonard (Sallie Monypeny) | female | 47.0 | 1 | 1 | 11751 | 52.5542 | D35 | S |
872 | 873 | 0 | 1 | Carlsson, Mr. Frans Olof | male | 33.0 | 0 | 0 | 695 | 5.0000 | B51 B53 B55 | S |
873 | 874 | 0 | 3 | Vander Cruyssen, Mr. Victor | male | 47.0 | 0 | 0 | 345765 | 9.0000 | NaN | S |
874 | 875 | 1 | 2 | Abelson, Mrs. Samuel (Hannah Wizosky) | female | 28.0 | 1 | 0 | P/PP 3381 | 24.0000 | NaN | C |
875 | 876 | 1 | 3 | Najib, Miss. Adele Kiamie "Jane" | female | 15.0 | 0 | 0 | 2667 | 7.2250 | NaN | C |
876 | 877 | 0 | 3 | Gustafsson, Mr. Alfred Ossian | male | 20.0 | 0 | 0 | 7534 | 9.8458 | NaN | S |
877 | 878 | 0 | 3 | Petroff, Mr. Nedelio | male | 19.0 | 0 | 0 | 349212 | 7.8958 | NaN | S |
878 | 879 | 0 | 3 | Laleff, Mr. Kristo | male | NaN | 0 | 0 | 349217 | 7.8958 | NaN | S |
879 | 880 | 1 | 1 | Potter, Mrs. Thomas Jr (Lily Alexenia Wilson) | female | 56.0 | 0 | 1 | 11767 | 83.1583 | C50 | C |
880 | 881 | 1 | 2 | Shelley, Mrs. William (Imanita Parrish Hall) | female | 25.0 | 0 | 1 | 230433 | 26.0000 | NaN | S |
881 | 882 | 0 | 3 | Markun, Mr. Johann | male | 33.0 | 0 | 0 | 349257 | 7.8958 | NaN | S |
882 | 883 | 0 | 3 | Dahlberg, Miss. Gerda Ulrika | female | 22.0 | 0 | 0 | 7552 | 10.5167 | NaN | S |
883 | 884 | 0 | 2 | Banfield, Mr. Frederick James | male | 28.0 | 0 | 0 | C.A./SOTON 34068 | 10.5000 | NaN | S |
884 | 885 | 0 | 3 | Sutehall, Mr. Henry Jr | male | 25.0 | 0 | 0 | SOTON/OQ 392076 | 7.0500 | NaN | S |
885 | 886 | 0 | 3 | Rice, Mrs. William (Margaret Norton) | female | 39.0 | 0 | 5 | 382652 | 29.1250 | NaN | Q |
886 | 887 | 0 | 2 | Montvila, Rev. Juozas | male | 27.0 | 0 | 0 | 211536 | 13.0000 | NaN | S |
887 | 888 | 1 | 1 | Graham, Miss. Margaret Edith | female | 19.0 | 0 | 0 | 112053 | 30.0000 | B42 | S |
888 | 889 | 0 | 3 | Johnston, Miss. Catherine Helen "Carrie" | female | NaN | 1 | 2 | W./C. 6607 | 23.4500 | NaN | S |
889 | 890 | 1 | 1 | Behr, Mr. Karl Howell | male | 26.0 | 0 | 0 | 111369 | 30.0000 | C148 | C |
890 | 891 | 0 | 3 | Dooley, Mr. Patrick | male | 32.0 | 0 | 0 | 370376 | 7.7500 | NaN | Q |
891 rows × 12 columns
df.columns
Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
dtype='object')
PassengerId - уникальный идентификатор каждого человека
Survived - выжил ли человек
Pclass - класс пассажира
Name - имя пассажира
Sex - пол
Age - возраст
SibSp - количество родственников на борту (братья, сёстры, супруги)
Parch - количество прямых родственников на борту (родители, дети)
Ticket - номер билета
Fare - стоимость билета
Cabin - кабина
Embarked - порт посадки
Каждая запись эквивалентна одному пассажиру
df.Survived.unique()
array([0, 1])
df.Pclass.unique()
array([3, 1, 2])
df.Sex.unique()
array(['male', 'female'], dtype=object)
df.Age.describe()
count 891.000000
mean 29.361582
std 13.019697
min 0.420000
25% 22.000000
50% 28.000000
75% 35.000000
max 80.000000
Name: Age, dtype: float64
У возраста некоторые значения не заполнены. Посмотрим, сколько значений заполнено
df.PassengerId[df.Age.notnull()].count()
714
Не так много по сравнению с количеством записей. Посмотрим распределение возрастов (потому что графики - это красиво)
df.Age.plot(kind='kde')
show()
Выглядит вполне нормально. Хорошим решением будет заменить отсутствующие значения на медиану
df.SibSp.unique()
array([1, 0, 3, 4, 2, 5, 8])
df.Parch.unique()
array([0, 1, 2, 5, 3, 4, 6])
df.Fare.describe()
count 891.000000
mean 32.204208
std 49.693429
min 0.000000
25% 7.910400
50% 14.454200
75% 31.000000
max 512.329200
Name: Fare, dtype: float64
df[df.Embarked.isnull()]
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
61 | 62 | 1 | 1 | Icard, Miss. Amelie | female | 38.0 | 0 | 0 | 113572 | 80.0 | B28 | NaN |
829 | 830 | 1 | 1 | Stone, Mrs. George Nelson (Martha Evelyn) | female | 62.0 | 0 | 0 | 113572 | 80.0 | B28 | NaN |
Всего у двух значений не указан порт
df.PassengerId[df.Cabin.notnull()].count()
204
Лишь около 23% кабин заполнено значениями (204 из 891)
Итак, почти все значения (кроме Name, PassengerId, Ticket, но о них позже) изучены. Теперь самое время начать обрабатывать данные
# Replace missing ages with the median age of the column.
df["Age"] = df["Age"].fillna(df["Age"].median())
# Replace missing ports with the most frequent one: count passengers per
# port, then take the first port label that carries the maximum count.
max_embarked = df.groupby("Embarked")["PassengerId"].count()
df["Embarked"] = df["Embarked"].fillna(max_embarked.idxmax())
Name, PassengerId, Ticket - эти значения являются просто идентификаторами и по сути в обучении нам никак не помогут, поэтому хорошей идеей будет дропнуть их.
Заодно дропнем Cabin из-за малого количества заполненных значений
# Name, Ticket and PassengerId are treated as plain identifiers and Cabin is
# mostly empty, so all four columns are removed before training.
dropped_columns = ["Name", "Ticket", "PassengerId", "Cabin"]
df = df.drop(labels=dropped_columns, axis=1)
df
Survived | Pclass | Sex | Age | SibSp | Parch | Fare | Embarked | |
---|---|---|---|---|---|---|---|---|
0 | 0 | 3 | male | 22.0 | 1 | 0 | 7.2500 | S |
1 | 1 | 1 | female | 38.0 | 1 | 0 | 71.2833 | C |
2 | 1 | 3 | female | 26.0 | 0 | 0 | 7.9250 | S |
3 | 1 | 1 | female | 35.0 | 1 | 0 | 53.1000 | S |
4 | 0 | 3 | male | 35.0 | 0 | 0 | 8.0500 | S |
5 | 0 | 3 | male | 28.0 | 0 | 0 | 8.4583 | Q |
6 | 0 | 1 | male | 54.0 | 0 | 0 | 51.8625 | S |
7 | 0 | 3 | male | 2.0 | 3 | 1 | 21.0750 | S |
8 | 1 | 3 | female | 27.0 | 0 | 2 | 11.1333 | S |
9 | 1 | 2 | female | 14.0 | 1 | 0 | 30.0708 | C |
10 | 1 | 3 | female | 4.0 | 1 | 1 | 16.7000 | S |
11 | 1 | 1 | female | 58.0 | 0 | 0 | 26.5500 | S |
12 | 0 | 3 | male | 20.0 | 0 | 0 | 8.0500 | S |
13 | 0 | 3 | male | 39.0 | 1 | 5 | 31.2750 | S |
14 | 0 | 3 | female | 14.0 | 0 | 0 | 7.8542 | S |
15 | 1 | 2 | female | 55.0 | 0 | 0 | 16.0000 | S |
16 | 0 | 3 | male | 2.0 | 4 | 1 | 29.1250 | Q |
17 | 1 | 2 | male | 28.0 | 0 | 0 | 13.0000 | S |
18 | 0 | 3 | female | 31.0 | 1 | 0 | 18.0000 | S |
19 | 1 | 3 | female | 28.0 | 0 | 0 | 7.2250 | C |
20 | 0 | 2 | male | 35.0 | 0 | 0 | 26.0000 | S |
21 | 1 | 2 | male | 34.0 | 0 | 0 | 13.0000 | S |
22 | 1 | 3 | female | 15.0 | 0 | 0 | 8.0292 | Q |
23 | 1 | 1 | male | 28.0 | 0 | 0 | 35.5000 | S |
24 | 0 | 3 | female | 8.0 | 3 | 1 | 21.0750 | S |
25 | 1 | 3 | female | 38.0 | 1 | 5 | 31.3875 | S |
26 | 0 | 3 | male | 28.0 | 0 | 0 | 7.2250 | C |
27 | 0 | 1 | male | 19.0 | 3 | 2 | 263.0000 | S |
28 | 1 | 3 | female | 28.0 | 0 | 0 | 7.8792 | Q |
29 | 0 | 3 | male | 28.0 | 0 | 0 | 7.8958 | S |
... | ... | ... | ... | ... | ... | ... | ... | ... |
861 | 0 | 2 | male | 21.0 | 1 | 0 | 11.5000 | S |
862 | 1 | 1 | female | 48.0 | 0 | 0 | 25.9292 | S |
863 | 0 | 3 | female | 28.0 | 8 | 2 | 69.5500 | S |
864 | 0 | 2 | male | 24.0 | 0 | 0 | 13.0000 | S |
865 | 1 | 2 | female | 42.0 | 0 | 0 | 13.0000 | S |
866 | 1 | 2 | female | 27.0 | 1 | 0 | 13.8583 | C |
867 | 0 | 1 | male | 31.0 | 0 | 0 | 50.4958 | S |
868 | 0 | 3 | male | 28.0 | 0 | 0 | 9.5000 | S |
869 | 1 | 3 | male | 4.0 | 1 | 1 | 11.1333 | S |
870 | 0 | 3 | male | 26.0 | 0 | 0 | 7.8958 | S |
871 | 1 | 1 | female | 47.0 | 1 | 1 | 52.5542 | S |
872 | 0 | 1 | male | 33.0 | 0 | 0 | 5.0000 | S |
873 | 0 | 3 | male | 47.0 | 0 | 0 | 9.0000 | S |
874 | 1 | 2 | female | 28.0 | 1 | 0 | 24.0000 | C |
875 | 1 | 3 | female | 15.0 | 0 | 0 | 7.2250 | C |
876 | 0 | 3 | male | 20.0 | 0 | 0 | 9.8458 | S |
877 | 0 | 3 | male | 19.0 | 0 | 0 | 7.8958 | S |
878 | 0 | 3 | male | 28.0 | 0 | 0 | 7.8958 | S |
879 | 1 | 1 | female | 56.0 | 0 | 1 | 83.1583 | C |
880 | 1 | 2 | female | 25.0 | 0 | 1 | 26.0000 | S |
881 | 0 | 3 | male | 33.0 | 0 | 0 | 7.8958 | S |
882 | 0 | 3 | female | 22.0 | 0 | 0 | 10.5167 | S |
883 | 0 | 2 | male | 28.0 | 0 | 0 | 10.5000 | S |
884 | 0 | 3 | male | 25.0 | 0 | 0 | 7.0500 | S |
885 | 0 | 3 | female | 39.0 | 0 | 5 | 29.1250 | Q |
886 | 0 | 2 | male | 27.0 | 0 | 0 | 13.0000 | S |
887 | 1 | 1 | female | 19.0 | 0 | 0 | 30.0000 | S |
888 | 0 | 3 | female | 28.0 | 1 | 2 | 23.4500 | S |
889 | 1 | 1 | male | 26.0 | 0 | 0 | 30.0000 | C |
890 | 0 | 3 | male | 32.0 | 0 | 0 | 7.7500 | Q |
891 rows × 8 columns