"""Cheat sheet of common pandas / scikit-learn / Keras snippets.

NOTE(review): this is a collection of illustrative fragments, not a runnable
script -- `df`, `value`, `accuracies`, `n_fold`, `random_state` and the
sklearn/keras classes are assumed to be defined/imported by surrounding
context.  Reformatted from a single collapsed line into valid Python.
"""

# --- Handling missing data ---------------------------------------------------
# fillna(method='ffill') is deprecated since pandas 2.1 -- use .ffill().
# Assigning back also avoids chained-assignment pitfalls with inplace=True.
df['col'] = df['col'].ffill()                      # forward fill
df = df.dropna(subset=['col'])                     # drop rows with NaN in 'col'

# --- Basic filtering ---------------------------------------------------------
df[(df['col1'] > value) & (df['col2'] < value)]    # conditional filtering
df[['col1', 'col2']]                               # select specific columns

# --- Date handling -----------------------------------------------------------
df['date_col'] = pd.to_datetime(df['date_col'])    # convert to datetime
df['month'] = df['date_col'].dt.month              # extract date features
df['day'] = df['date_col'].dt.day
df['hour'] = df['date_col'].dt.hour

df.iloc[10:20]                                     # rows 10..19 (positional)
df['col'].unique()                                 # unique values in a column

# --- NumPy basics ------------------------------------------------------------
arr = np.array([1, 2, 3])                          # create numpy array
arr * 2                                            # element-wise multiplication

# --- Encoding & scaling ------------------------------------------------------
df = pd.get_dummies(df, columns=['categorical_col'], drop_first=True)  # one-hot

scaler = StandardScaler()                          # zero mean, unit variance
df[['feat1', 'feat2']] = scaler.fit_transform(df[['feat1', 'feat2']])

# --- Train/test split + linear regression ------------------------------------
X, y = df.drop('target', axis=1), df['target']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

lin_reg = LinearRegression()                       # initialize
lin_reg.fit(X_train, y_train)                      # train
y_pred = lin_reg.predict(X_test)                   # predict
print("MSE:", mean_squared_error(y_test, y_pred))  # evaluate

# --- K-fold cross-validation -------------------------------------------------
kf = KFold(n_splits=n_fold, random_state=random_state, shuffle=True)
for fold, (train_idx, test_idx) in enumerate(kf.split(X, y)):
    # KFold yields *positional* indices; the original `X[train]` only works
    # for numpy arrays.  X/y are a DataFrame/Series here, so use .iloc.
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
    model = LinearRegression().fit(X_train, y_train)

# --- Logistic regression -----------------------------------------------------
log_reg = LogisticRegression()                     # initialize
log_reg.fit(X_train, y_train)                      # train
print("Accuracy:", accuracy_score(y_test, log_reg.predict(X_test)))

df = pd.get_dummies(df, columns=['col1', 'col2'], drop_first=True)  # one-hot

# --- One-standard-error rule -------------------------------------------------
# `accuracies` is assumed to be shape (n_models, n_folds) -- TODO confirm.
acc_mean = np.mean(accuracies, axis=1)
acc_se = np.std(accuracies, axis=1, ddof=1) / np.sqrt(accuracies.shape[1])
best_model_idx = np.argmax(acc_mean)
best_mean_acc = acc_mean[best_model_idx]
# BUG FIX: the `- acc_se[...]` term was split onto its own line in the
# original, leaving it a dead expression -- the SE was never subtracted.
one_se_threshold = best_mean_acc - acc_se[best_model_idx]
# First model (assumed ordered simplest -> most complex) within one SE of
# the best mean accuracy.
simplest_model_idx = np.argmax(acc_mean >= one_se_threshold)

# --- Simple neural network (Keras) -------------------------------------------
model = Sequential()
model.add(Dense(32, activation='relu', input_shape=(X_train.shape[1],)))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy',
              metrics=['accuracy'])
model.fit(X_train, y_train, epochs=10, batch_size=32,
          validation_data=(X_test, y_test))

# --- SVM with grid-search cross-validation -----------------------------------
svm = SVC(C=1.0, kernel='rbf')
param_grid = {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}
grid_search = GridSearchCV(svm, param_grid, cv=5)  # 5-fold CV over the grid
grid_search.fit(X_train, y_train)