# ml-and-mlops-questions
The script below trains a logistic regression and a decision tree classifier on a scaled cancer dataset, then logs the hyperparameters, evaluation metrics, preprocessing details, and fitted models to a local MLflow tracking server.

```python
import mlflow
import mlflow.sklearn
from sklearn.datasets import load_breast_cancer  # assumption: the original snippet does not show how X, y are loaded
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report


# Function to log preprocessing steps as a JSON artifact
def log_preprocessing(preprocessing_steps):
    mlflow.log_dict(preprocessing_steps, "preprocessing_steps.json")


# Handle model training, evaluation, and logging
def train_evaluate_log_model(model, model_name, X_train, X_test, y_train, y_test,
                             params, preprocessing_steps):
    # Train the model
    model.fit(X_train, y_train)

    # Predict and evaluate
    y_pred = model.predict(X_test)
    print(f"{model_name}:")
    print(classification_report(y_test, y_pred))

    # Generate the classification report as a dictionary for metric logging
    class_report = classification_report(y_test, y_pred, output_dict=True)

    # Log with MLflow
    with mlflow.start_run():
        # Log preprocessing steps inside the run so the artifact is attached to it
        log_preprocessing(preprocessing_steps)
        mlflow.log_params(params)
        mlflow.log_metrics({
            'accuracy': class_report['accuracy'],
            'recall_class_0': class_report['0']['recall'],
            'recall_class_1': class_report['1']['recall'],
            'f1_score': class_report['macro avg']['f1-score'],
        })
        mlflow.sklearn.log_model(model, model_name)
    print(f"Model {model_name} logged successfully.\n")


# Load and preprocess data
X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=102)

# Preprocessing steps
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Record preprocessing steps
preprocessing_steps = {
    "scaling": {
        "method": "StandardScaler",
        "mean": scaler.mean_.tolist(),  # mean used for scaling
        "var": scaler.var_.tolist(),    # variance used for scaling
    },
    "train_test_split": {
        "test_size": 0.2,
        "random_state": 102,
    },
}

# Set up MLflow: point at the tracking server first, then select the experiment
mlflow.set_tracking_uri(uri="http://127.0.0.1:5000/")
mlflow.set_experiment("cancer_data")

# Logistic Regression
logistic_params = {
    "solver": "lbfgs",
    "max_iter": 10000,
    "multi_class": "auto",
    "random_state": 8888,
}
logistic_model = LogisticRegression(**logistic_params)
train_evaluate_log_model(logistic_model, "Logistic Regression", X_train, X_test,
                         y_train, y_test, logistic_params, preprocessing_steps)

# Decision Tree Classifier
dt_params = {
    "random_state": 8888,
}
dt_model = DecisionTreeClassifier(**dt_params)
train_evaluate_log_model(dt_model, "Decision Tree Classifier", X_train, X_test,
                         y_train, y_test, dt_params, preprocessing_steps)

# You can add other models like SVM, RandomForest, and XGBoost similarly:
# Example:
# from sklearn.ensemble import RandomForestClassifier
# rf_params = {"n_estimators": 100, "random_state": 8888}
# rf_model = RandomForestClassifier(**rf_params)
# train_evaluate_log_model(rf_model, "Random Forest Classifier", X_train, X_test,
#                          y_train, y_test, rf_params, preprocessing_steps)
```
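Once the runs are recorded, the tracking server can be queried to compare them and to pull a logged model back for inference. Below is a minimal sketch, assuming a reasonably recent MLflow client, a tracking server still running at http://127.0.0.1:5000/ (for example started with `mlflow server --host 127.0.0.1 --port 5000`), and that it executes in the same session as the script above so `X_test` is still in scope and already scaled.

```python
import mlflow
import mlflow.sklearn

mlflow.set_tracking_uri("http://127.0.0.1:5000/")

# List all runs in the experiment, best macro F1 score first
runs = mlflow.search_runs(
    experiment_names=["cancer_data"],
    order_by=["metrics.f1_score DESC"],
)
print(runs[["run_id", "metrics.accuracy", "metrics.f1_score"]])

# Pick out the logistic regression run via one of its logged params, then load the
# model stored under its "Logistic Regression" artifact path
logreg_runs = mlflow.search_runs(
    experiment_names=["cancer_data"],
    filter_string="params.solver = 'lbfgs'",
)
run_id = logreg_runs.loc[0, "run_id"]
loaded_model = mlflow.sklearn.load_model(f"runs:/{run_id}/Logistic Regression")

# X_test was already transformed with the fitted StandardScaler, so it can be used directly
print(loaded_model.predict(X_test[:5]))
```

Filtering on the logged `solver` parameter is only a convenient way to tell the two runs apart here; registering the models in the MLflow Model Registry would be the more robust way to version and retrieve them.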
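As an alternative to the manual logging in the helper above, MLflow's scikit-learn autologging can capture estimator parameters, training metrics, and the model artifact automatically. A brief sketch, reusing `X_train`, `y_train`, `X_test`, and `y_test` from the script above:

```python
import mlflow
import mlflow.sklearn
from sklearn.linear_model import LogisticRegression

mlflow.set_tracking_uri("http://127.0.0.1:5000/")
mlflow.set_experiment("cancer_data")

# Autologging records estimator params, training metrics, and the model artifact
mlflow.sklearn.autolog()

with mlflow.start_run(run_name="logreg_autolog"):
    model = LogisticRegression(max_iter=10000, random_state=8888)
    model.fit(X_train, y_train)  # X_train/y_train are the scaled splits from above
    # Metrics on the held-out split still need to be logged explicitly
    mlflow.log_metric("test_accuracy", model.score(X_test, y_test))
```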