Skip to content

Commit 436e9f8

Browse files
committed
fixed example
1 parent a206ac0 commit 436e9f8

1 file changed

Lines changed: 2 additions & 124 deletions

File tree

README.md

Lines changed: 2 additions & 124 deletions
Original file line number | Diff line number | Diff line change
@@ -70,129 +70,7 @@ print(projection.create("titanic_testing",
70 70
data_type_handler = DataTypeHandler()
71 71
type_fields = {
72 72
"Age": "number",
73-
"Pclass": "number",
74-
"SibSp": "number"
75-
}
76-
77-
print(data_type_handler.change_file_type(
78-
"titanic_testing_projection",
79-
type_fields))
80-
81-
type_fields["Survived"] = "number"
82-
83-
print(data_type_handler.change_file_type(
84-
"titanic_training_projection",
85-
type_fields))
86-
87-
88-
preprocessing_code = '''
89-
from pyspark.ml import Pipeline
90-
from pyspark.sql.functions import (
91-
mean, col, split,
92-
regexp_extract, when, lit)
93-
94-
from pyspark.ml.feature import (
95-
VectorAssembler,
96-
StringIndexer
97-
)
98-
99-
TRAINING_DF_INDEX = 0
100-
TESTING_DF_INDEX = 1
101-
102-
training_df = training_df.withColumnRenamed('Survived', 'label')
103-
testing_df = testing_df.withColumn('label', lit(0))
104-
datasets_list = [training_df, testing_df]
105-
106-
for index, dataset in enumerate(datasets_list):
107-
dataset = dataset.withColumn(
108-
"Initial",
109-
regexp_extract(col("Name"), "([A-Za-z]+)\.", 1))
110-
datasets_list[index] = dataset
111-
112-
misspelled_initials = [
113-
'Mlle', 'Mme', 'Ms', 'Dr',
114-
'Major', 'Lady', 'Countess',
115-
'Jonkheer', 'Col', 'Rev',
116-
'Capt', 'Sir', 'Don'
117-
]
118-
correct_initials = [
119-
'Miss', 'Miss', 'Miss', 'Mr',
120-
'Mr', 'Mrs', 'Mrs',
121-
'Other', 'Other', 'Other',
122-
'Mr', 'Mr', 'Mr'
123-
]
124-
for index, dataset in enumerate(datasets_list):
125-
dataset = dataset.replace(misspelled_initials, correct_initials)
126-
datasets_list[index] = dataset
127-
128-
129-
initials_age = {"Miss": 22,
130-
"Other": 46,
131-
"Master": 5,
132-
"Mr": 33,
133-
"Mrs": 36}
134-
for index, dataset in enumerate(datasets_list):
135-
for initial, initial_age in initials_age.items():
136-
dataset = dataset.withColumn(
137-
"Age",
138-
when((dataset["Initial"] == initial) &
139-
(dataset["Age"].isNull()), initial_age).otherwise(
140-
dataset["Age"]))
141-
datasets_list[index] = dataset
142-
143-
144-
for index, dataset in enumerate(datasets_list):
145-
dataset = dataset.na.fill({"Embarked": 'S'})
146-
datasets_list[index] = dataset
147-
148-
149-
for index, dataset in enumerate(datasets_list):
150-
dataset = dataset.withColumn("Family_Size", col('SibSp')+col('Parch'))
151-
dataset = dataset.withColumn('Alone', lit(0))
152-
dataset = dataset.withColumn(
153-
"Alone",
154-
when(dataset["Family_Size"] == 0, 1).otherwise(dataset["Alone"]))
155-
datasets_list[index] = dataset
156-
157-
158-
text_fields = ["Sex", "Embarked", "Initial"]
159-
for column in text_fields:
160-
for index, dataset in enumerate(datasets_list):
161-
dataset = StringIndexer(
162-
inputCol=column, outputCol=column+"_index").\
163-
fit(dataset).\
164-
transform(dataset)
165-
datasets_list[index] = dataset
166-
167-
168-
non_required_columns = ["Name", "Embarked", "Sex", "Initial"]
169-
for index, dataset in enumerate(datasets_list):
170-
dataset = dataset.drop(*non_required_columns)
171-
datasets_list[index] = dataset
172-
173-
174-
training_df = datasets_list[TRAINING_DF_INDEX]
175-
testing_df = datasets_list[TESTING_DF_INDEX]
176-
177-
assembler = VectorAssembler(
178-
inputCols=training_df.columns[:],
179-
outputCol="features")
180-
assembler.setHandleInvalid('skip')
181-
182-
features_training = assembler.transform(training_df)
183-
(features_training, features_evaluation) =\
184-
features_training.randomSplit([0.8, 0.2], seed=33)
185-
features_testing = assembler.transform(testing_df)
186-
'''
187-
188-
model_builder = Model()
189-
190-
print(model_builder.create_model(
191-
"titanic_training_projection",
192-
"titanic_testing_projection",
193-
preprocessing_code,
194-
["lr", "dt", "gb", "rf", "nb"]))
195-
``` "Fare": "number",
73+
"Fare": "number",
196 74
"Parch": "number",
197 75
"PassengerId": "number",
198 76
"Pclass": "number",
@@ -318,7 +196,7 @@ print(model_builder.create_model(
318 196
preprocessing_code,
319 197
["lr", "dt", "gb", "rf", "nb"]))
320 198
```
321-
199+
# Function APIs
322 200

323 201
## DatabaseApi
324 202

0 commit comments

Comments
 (0)