Data Cleaning and Preprocessing
Example Prompt:
Example Response:
import pandas as pd
# Load the raw data and take a first look at it.
df = pd.read_csv('your_file.csv')
print(df.head())
df.info()  # info() prints its summary itself and returns None, so it is not wrapped in print()

# Per-column count of missing values. A bare expression is only echoed in a
# notebook/REPL, so print it explicitly to see it when run as a script.
print(df.isnull().sum())

# Forward fill missing values. fillna(method='ffill') is deprecated in recent
# pandas releases; ffill() is the supported spelling of the same operation.
df.ffill(inplace=True)
df.drop_duplicates(inplace=True)

df['date_column'] = pd.to_datetime(df['date_column'])  # Convert to datetime
df['category_column'] = df['category_column'].astype('category')  # Convert to category
df['text_column'] = df['text_column'].str.lower()  # Convert to lowercase

# Outlier limits using the 1.5 * IQR rule. Replace these with domain-specific
# limits if you have them.
q1 = df['numeric_column'].quantile(0.25)
q3 = df['numeric_column'].quantile(0.75)
iqr = q3 - q1
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr

# Keep only the rows whose value lies inside [lower_bound, upper_bound].
df = df[(df['numeric_column'] >= lower_bound) & (df['numeric_column'] <= upper_bound)]  # Filter out outliers
Related Tasks:
Building Predictive Models
Example Prompt:
Example Response:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
df = pd.read_csv('your_file.csv')

# Separate the label from the features.
y = df['target_column']  # Replace 'target_column' with your actual target column name
X = df.drop(columns='target_column')

# Turn categorical features into dummy (indicator) columns; drop_first=True
# omits the first level of each encoded variable.
X = pd.get_dummies(X, drop_first=True)

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a 100-tree random forest on the training portion.
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Score the held-out rows and report overall accuracy plus per-class metrics.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')
print(classification_report(y_test, y_pred))
Related Tasks:
Conducting Hypothesis Testing
Example Prompt:
Example Response:
from scipy import stats
from scipy.stats import chi2_contingency

# Two-sample comparisons of group1 vs group2. Both calls store their p-value in
# `p_value`, so when both lines run, the Mann-Whitney result is the one left in
# that name -- keep only the test you need.
t_stat, p_value = stats.ttest_ind(group1, group2)  # For independent t-test
u_stat, p_value = stats.mannwhitneyu(group1, group2)  # For Mann-Whitney U test

# Chi-square test on a 2x2 table: one row per group, columns are
# (successes, failures).
group1_counts = [success_group1, failure_group1]
group2_counts = [success_group2, failure_group2]
contingency_table = [group1_counts, group2_counts]
chi2, p, dof, expected = chi2_contingency(contingency_table)
Related Tasks:
Creating Visualizations
Example Prompt:
Example Response:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
df = pd.read_csv('your_file.csv')
print(df.columns)

# Using seaborn
plt.figure(figsize=(10, 6))
sns.scatterplot(data=df, x='variable_x', y='variable_y', hue='category_column')  # Replace with your actual column names
plt.title('Scatter Plot of Variable X vs Variable Y')
plt.xlabel('Variable X')
plt.ylabel('Variable Y')
plt.legend(title='Category')
plt.show()

# Using matplotlib
# plt.scatter's `c` argument only accepts numbers or color specs, so a
# string/categorical column would raise a ValueError. Map such a column to
# integer codes first (missing values become -1); numeric columns pass through.
category_values = df['category_column']
if not pd.api.types.is_numeric_dtype(category_values):
    category_values = category_values.astype('category').cat.codes

plt.figure(figsize=(10, 6))  # Same size as the seaborn figure above
plt.scatter(df['variable_x'], df['variable_y'], c=category_values, cmap='viridis')
plt.title('Scatter Plot of Variable X vs Variable Y')
plt.xlabel('Variable X')
plt.ylabel('Variable Y')
plt.colorbar(label='Category')
plt.show()
Related Tasks:
Preparing Reports and Presentations
Example Prompt:
Example Response:
Related Tasks: