Data Cleaning and Preprocessing
Example Prompt:
Example Response:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
# Load the dataset
file_path = 'path_to_your_file.csv'
df = pd.read_csv(file_path)

# Check for missing values: print the number of nulls in each column.
print(df.isnull().sum())

# Select the float64/int64 columns once; the same selection drives both the
# imputation and the scaling steps below.
normalized_columns = df.select_dtypes(include=['float64', 'int64']).columns

# Handling missing values: fill each numeric column with its own mean.
# The result is assigned back to the frame rather than calling
# fillna(inplace=True) on df[column]: that chained form mutates a temporary
# object, triggers a pandas warning, and does not update df when
# Copy-on-Write is enabled.
for column in normalized_columns:
    df[column] = df[column].fillna(df[column].mean())

# Alternatively, you can drop rows with missing values instead of imputing:
# df = df.dropna()

# Normalizing numerical columns
scaler = MinMaxScaler()
# Only scale when at least one numeric column was selected; the scaler
# cannot be fitted on a frame with zero feature columns.
if not normalized_columns.empty:
    df[normalized_columns] = scaler.fit_transform(df[normalized_columns])

# Display the cleaned data
print(df.head())
Related Tasks:
Building Predictive Models
Example Prompt:
Example Response:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
# Read the CSV into a DataFrame.
file_path = 'path_to_your_file.csv'
df = pd.read_csv(file_path)

# Separate the label column from the feature columns.
# The label column is assumed to be named 'target'.
y = df['target']  # Target variable
X = df.drop(columns='target')  # Features

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Build a 100-tree random forest and train it on the training partition.
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train, y_train)

# Predict labels for the held-out rows.
y_pred = rf_classifier.predict(X_test)

# Report overall accuracy followed by the per-class metrics.
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print('Accuracy:', accuracy)
print(report)
Related Tasks:
Creating Visualizations for Insights
Example Prompt:
Example Response:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Read the CSV into a DataFrame.
file_path = 'path_to_your_file.csv'
df = pd.read_csv(file_path)

# Column names of the two variables to plot against each other.
x_variable = 'variable_x'
y_variable = 'variable_y'

# Open a 10x6 inch figure and draw the scatter plot on its current axes.
# seaborn returns the Axes it drew on, so the labels and grid are set on
# that object directly.
plt.figure(figsize=(10, 6))
axes = sns.scatterplot(x=x_variable, y=y_variable, data=df)

plot_title = 'Scatter Plot of {} vs {}'.format(x_variable, y_variable)
axes.set_title(plot_title)
axes.set_xlabel(x_variable)
axes.set_ylabel(y_variable)
axes.grid(True)

plt.show()
Related Tasks:
Mentoring Junior Data Scientists
Example Prompt:
Example Response:
Related Tasks:
Presenting Data Insights to Stakeholders
Example Prompt:
Example Response:
Related Tasks: