Perform Data Cleaning
Example Prompt:
Example Response:
import numpy as np
import pandas as pd
from scipy import stats

# Load the data and take a first look at it.
df = pd.read_csv('your_file.csv')
print(df.head())
df.info()  # info() prints its own report and returns None, so don't wrap it in print()

# Check for missing values
print(df.isnull().sum())

# Fill missing values with the mean (for numerical columns).
# Assign the result back to the column: fillna(inplace=True) on a column
# selection is chained assignment and is not guaranteed to modify df.
df['column_name'] = df['column_name'].fillna(df['column_name'].mean())

# Drop rows that still contain missing values
df.dropna(inplace=True)

# Drop exact duplicate rows
df.drop_duplicates(inplace=True)

# Parse dates and normalize the text of the categorical column
df['date_column'] = pd.to_datetime(df['date_column'])
df['category_column'] = df['category_column'].str.lower().str.strip()

# Remove outliers: keep only rows whose absolute z-score is below 3
z_scores = np.abs(stats.zscore(df['numerical_column']))
df = df[z_scores < 3]

# Review the cleaned data
df.info()
print(df.describe())
Related Tasks:
Conduct Statistical Tests
Example Prompt:
Example Response:
import pandas as pd
import statsmodels.api as sm
from scipy import stats

# NOTE: `df` is not defined in this snippet; it is assumed to be an
# already-loaded DataFrame containing the columns referenced below.

# Independent two-sample t-test on 'value' between groups A and B
group1 = df[df['group'] == 'A']['value']
group2 = df[df['group'] == 'B']['value']
t_stat, p_value = stats.ttest_ind(group1, group2)
print(f'T-statistic: {t_stat}, P-value: {p_value}')

# One-way ANOVA on 'value' across groups A, B and C
# (p_value is deliberately reused; the t-test result was already printed)
group_values = [df[df['group'] == label]['value'] for label in ('A', 'B', 'C')]
f_stat, p_value = stats.f_oneway(*group_values)
print(f'F-statistic: {f_stat}, P-value: {p_value}')

# Correlation between two variables (Series.corr defaults to Pearson)
correlation = df['variable1'].corr(df['variable2'])
print(f'Correlation: {correlation}')

# Chi-squared test of independence between two categorical columns
contingency_table = pd.crosstab(df['category1'], df['category2'])
chi2, p, dof, expected = stats.chi2_contingency(contingency_table)
print(f'Chi-squared: {chi2}, P-value: {p}')

# Ordinary least squares regression of 'outcome' on two predictors
X = df[['predictor1', 'predictor2']]  # Independent variables
y = df['outcome']  # Dependent variable
X = sm.add_constant(X)  # Adds a constant (intercept) term to the predictors
model = sm.OLS(y, X).fit()
print(model.summary())
Related Tasks:
Create Visualizations
Example Prompt:
Example Response:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Load the data and compute the mean of 'sales' for each 'category'.
df = pd.read_csv('your_file.csv')
sales_by_category = df.groupby('category')['sales']
summary_df = sales_by_category.mean().reset_index()

# Draw one bar per category on a 10x6 inch figure.
plt.figure(figsize=(10, 6))
ax = sns.barplot(data=summary_df, x='category', y='sales')

# Title and axis labels are set on the Axes that barplot returned.
ax.set(
    title='Average Sales by Category',
    xlabel='Category',
    ylabel='Average Sales',
)

# Rotate the category labels, then fit everything inside the figure.
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
Related Tasks:
Generate Analytical Reports
Example Prompt:
Example Response:
Related Tasks:
Build Predictive Models
Example Prompt:
Example Response:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# Columns used as model inputs and as the value to predict.
FEATURE_COLUMNS = ['feature1', 'feature2', 'feature3']
TARGET_COLUMN = 'target_variable'

df = pd.read_csv('your_file.csv')
X = df[FEATURE_COLUMNS]  # Independent variables
y = df[TARGET_COLUMN]  # Dependent variable

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
split = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split

# fit() returns the fitted estimator, so construction and training can be chained.
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

# Score the predictions on the held-out rows.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')
Related Tasks: