@@ -70,129 +70,7 @@ print(projection.create("titanic_testing",
7070data_type_handler = DataTypeHandler()
7171type_fields = {
7272 "Age": "number",
73- "Pclass": "number",
74- "SibSp": "number"
75- }
76-
77- print(data_type_handler.change_file_type(
78- "titanic_testing_projection",
79- type_fields))
80-
81- type_fields["Survived"] = "number"
82-
83- print(data_type_handler.change_file_type(
84- "titanic_training_projection",
85- type_fields))
86-
87-
88- preprocessing_code = '''
89- from pyspark.ml import Pipeline
90- from pyspark.sql.functions import (
91- mean, col, split,
92- regexp_extract, when, lit)
93-
94- from pyspark.ml.feature import (
95- VectorAssembler,
96- StringIndexer
97- )
98-
99- TRAINING_DF_INDEX = 0
100- TESTING_DF_INDEX = 1
101-
102- training_df = training_df.withColumnRenamed('Survived', 'label')
103- testing_df = testing_df.withColumn('label', lit(0))
104- datasets_list = [training_df, testing_df]
105-
106- for index, dataset in enumerate(datasets_list):
107- dataset = dataset.withColumn(
108- "Initial",
109- regexp_extract(col("Name"), "([A-Za-z]+)\.", 1))
110- datasets_list[index] = dataset
111-
112- misspelled_initials = [
113- 'Mlle', 'Mme', 'Ms', 'Dr',
114- 'Major', 'Lady', 'Countess',
115- 'Jonkheer', 'Col', 'Rev',
116- 'Capt', 'Sir', 'Don'
117- ]
118- correct_initials = [
119- 'Miss', 'Miss', 'Miss', 'Mr',
120- 'Mr', 'Mrs', 'Mrs',
121- 'Other', 'Other', 'Other',
122- 'Mr', 'Mr', 'Mr'
123- ]
124- for index, dataset in enumerate(datasets_list):
125- dataset = dataset.replace(misspelled_initials, correct_initials)
126- datasets_list[index] = dataset
127-
128-
129- initials_age = {"Miss": 22,
130- "Other": 46,
131- "Master": 5,
132- "Mr": 33,
133- "Mrs": 36}
134- for index, dataset in enumerate(datasets_list):
135- for initial, initial_age in initials_age.items():
136- dataset = dataset.withColumn(
137- "Age",
138- when((dataset["Initial"] == initial) &
139- (dataset["Age"].isNull()), initial_age).otherwise(
140- dataset["Age"]))
141- datasets_list[index] = dataset
142-
143-
144- for index, dataset in enumerate(datasets_list):
145- dataset = dataset.na.fill({"Embarked": 'S'})
146- datasets_list[index] = dataset
147-
148-
149- for index, dataset in enumerate(datasets_list):
150- dataset = dataset.withColumn("Family_Size", col('SibSp')+col('Parch'))
151- dataset = dataset.withColumn('Alone', lit(0))
152- dataset = dataset.withColumn(
153- "Alone",
154- when(dataset["Family_Size"] == 0, 1).otherwise(dataset["Alone"]))
155- datasets_list[index] = dataset
156-
157-
158- text_fields = ["Sex", "Embarked", "Initial"]
159- for column in text_fields:
160- for index, dataset in enumerate(datasets_list):
161- dataset = StringIndexer(
162- inputCol=column, outputCol=column+"_index").\
163- fit(dataset).\
164- transform(dataset)
165- datasets_list[index] = dataset
166-
167-
168- non_required_columns = ["Name", "Embarked", "Sex", "Initial"]
169- for index, dataset in enumerate(datasets_list):
170- dataset = dataset.drop(*non_required_columns)
171- datasets_list[index] = dataset
172-
173-
174- training_df = datasets_list[TRAINING_DF_INDEX]
175- testing_df = datasets_list[TESTING_DF_INDEX]
176-
177- assembler = VectorAssembler(
178- inputCols=training_df.columns[:],
179- outputCol="features")
180- assembler.setHandleInvalid('skip')
181-
182- features_training = assembler.transform(training_df)
183- (features_training, features_evaluation) =\
184- features_training.randomSplit([0.8, 0.2], seed=33)
185- features_testing = assembler.transform(testing_df)
186- '''
187-
188- model_builder = Model()
189-
190- print(model_builder.create_model(
191- "titanic_training_projection",
192- "titanic_testing_projection",
193- preprocessing_code,
194- ["lr", "dt", "gb", "rf", "nb"]))
195- ``` "Fare": "number",
73+ "Fare": "number",
19674 "Parch": "number",
19775 "PassengerId": "number",
19876 "Pclass": "number",
@@ -318,7 +196,7 @@ print(model_builder.create_model(
318196 preprocessing_code,
319197 ["lr", "dt", "gb", "rf", "nb"]))
320198```
321-
199+ # Function APIs
322200
323201## DatabaseApi
324202
0 commit comments