Код: Выделить всё
7mN, 0.03 mm/sec,
7mN, 0.05 mm/sec,
10mN,0.03 mm/sec,
10mN,0.05 mm/sec
Однако я хочу провести статистические тесты, чтобы убедиться, что мои результаты статистически значимы. Я мог бы подумать о парном T-тесте. Однако, когда я попытался проверить, нормально ли распределялась разница между силами «до» и «после», мой код сказал мне, что это не так. Однако я не использовал поправочный коэффициент. Я прилагаю следующий фрагмент кода, надеясь, что он даст читателям немного больше информации (не обращайте внимания на условие «После повторного применения липидов», поскольку в настоящее время меня это не беспокоит):
Код: Выделить всё
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import shapiro, levene, f_oneway
import warnings
warnings.filterwarnings('ignore')
def check_statistical_assumptions(csv_path, output_dir=None):
"""
Comprehensive check of statistical assumptions for friction difference data.
Args:
csv_path (str): Path to the CSV file containing all samples statistics
output_dir (str): Directory to save output plots and results. If None, displays plots.
Returns:
dict: Dictionary containing all test results
"""
# Read the data
df = pd.read_csv(csv_path)
# Calculate differences (Region 1 - Region 2)
pivot = df.pivot_table(
index=['Sample_Number', 'force', 'velocity', 'condition'],
columns='region',
values='mean'
).reset_index()
if 'Region 1' not in pivot.columns or 'Region 2' not in pivot.columns:
raise ValueError("Both 'Region 1' and 'Region 2' must be present in the data.")
pivot['difference'] = pivot['Region 1'] - pivot['Region 2']
# Dictionary to store all results
results = {
'normality_tests': {},
'homogeneity_tests': {},
'outlier_analysis': {},
'summary_statistics': {}
}
# Map condition names
condition_labels = {
'Before': 'Before solvent washing',
'After': 'After solvent washing',
'After lipid reapplication': 'After lipid reapplication'
}
conditions = ['Before', 'After', 'After lipid reapplication']
print("="*80)
print("STATISTICAL ASSUMPTIONS TESTING")
print("="*80)
# =========================================================================
# 1. NORMALITY TESTS
# =========================================================================
print("\n" + "="*80)
print("1. NORMALITY TESTS (Shapiro-Wilk Test)")
print("="*80)
print("Null Hypothesis: Data is normally distributed")
print("If p > 0.05: Data is likely normal (can use parametric tests)")
print("If p ≤ 0.05: Data is not normal (use non-parametric tests)")
print("-"*80)
# Test normality for each condition
for condition in conditions:
condition_data = pivot[pivot['condition'] == condition]['difference'].dropna()
if len(condition_data) >= 3: # Shapiro-Wilk requires at least 3 samples
stat, p_value = shapiro(condition_data)
results['normality_tests'][condition] = {
'statistic': stat,
'p_value': p_value,
'n_samples': len(condition_data),
'is_normal': p_value > 0.05
}
display_label = condition_labels[condition]
print(f"\n{display_label}:")
print(f" Sample size: {len(condition_data)}")
print(f" Shapiro-Wilk statistic: {stat:.4f}")
print(f" p-value: {p_value:.4f}")
print(f" Result: {'✓ Normal distribution' if p_value > 0.05 else '✗ Not normally distributed'}")
else:
print(f"\n{condition_labels[condition]}: Insufficient data (n={len(condition_data)})")
# Test normality of DIFFERENCES between conditions (for paired tests)
print("\n" + "-"*80)
print("NORMALITY OF PAIRED DIFFERENCES:")
print("-"*80)
# Before vs After
before_data = pivot[pivot['condition'] == 'Before'].set_index(['Sample_Number', 'force', 'velocity'])['difference']
after_data = pivot[pivot['condition'] == 'After'].set_index(['Sample_Number', 'force', 'velocity'])['difference']
# Align the data
aligned = pd.DataFrame({'before': before_data, 'after': after_data}).dropna()
if len(aligned) >= 3:
diff_before_after = aligned['before'] - aligned['after']
stat, p_value = shapiro(diff_before_after)
results['normality_tests']['difference_before_after'] = {
'statistic': stat,
'p_value': p_value,
'n_samples': len(diff_before_after),
'is_normal': p_value > 0.05
}
print(f"\nBefore vs After (paired differences):")
print(f" Sample size: {len(diff_before_after)}")
print(f" Shapiro-Wilk statistic: {stat:.4f}")
print(f" p-value: {p_value:.4f}")
print(f" Result: {'✓ Normal distribution' if p_value > 0.05 else '✗ Not normally distributed'}")
# After vs After lipid reapplication
reapp_data = pivot[pivot['condition'] == 'After lipid reapplication'].set_index(['Sample_Number', 'force', 'velocity'])['difference']
aligned2 = pd.DataFrame({'after': after_data, 'reapp': reapp_data}).dropna()
if len(aligned2) >= 3:
diff_after_reapp = aligned2['after'] - aligned2['reapp']
stat, p_value = shapiro(diff_after_reapp)
results['normality_tests']['difference_after_reapp'] = {
'statistic': stat,
'p_value': p_value,
'n_samples': len(diff_after_reapp),
'is_normal': p_value > 0.05
}
print(f"\nAfter vs After lipid reapplication (paired differences):")
print(f" Sample size: {len(diff_after_reapp)}")
print(f" Shapiro-Wilk statistic: {stat:.4f}")
print(f" p-value: {p_value:.4f}")
print(f" Result: {'✓ Normal distribution' if p_value > 0.05 else '✗ Not normally distributed'}")
# =========================================================================
# 2. HOMOGENEITY OF VARIANCE (Levene's Test)
# =========================================================================
print("\n" + "="*80)
print("2. HOMOGENEITY OF VARIANCE (Levene's Test)")
print("="*80)
print("Null Hypothesis: Variances are equal across groups")
print("If p > 0.05: Variances are equal (homoscedasticity)")
print("If p ≤ 0.05: Variances are unequal (heteroscedasticity)")
print("-"*80)
# Prepare data for Levene's test
groups = [pivot[pivot['condition'] == cond]['difference'].dropna() for cond in conditions]
groups = [g for g in groups if len(g) > 0] # Remove empty groups
if len(groups) >= 2:
stat, p_value = levene(*groups)
results['homogeneity_tests']['levene'] = {
'statistic': stat,
'p_value': p_value,
'equal_variance': p_value > 0.05
}
print(f"\nLevene's test across all conditions:")
print(f" Levene statistic: {stat:.4f}")
print(f" p-value: {p_value:.4f}")
print(f" Result: {'✓ Equal variances' if p_value > 0.05 else '✗ Unequal variances'}")
# =========================================================================
# 3. OUTLIER DETECTION
# =========================================================================
print("\n" + "="*80)
print("3. OUTLIER DETECTION (IQR Method)")
print("="*80)
print("Outliers defined as: Q1 - 1.5*IQR or Q3 + 1.5*IQR")
print("-"*80)
for condition in conditions:
condition_data = pivot[pivot['condition'] == condition]['difference'].dropna()
if len(condition_data) > 0:
Q1 = condition_data.quantile(0.25)
Q3 = condition_data.quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
outliers = condition_data[(condition_data < lower_bound) | (condition_data > upper_bound)]
results['outlier_analysis'][condition] = {
'Q1': Q1,
'Q3': Q3,
'IQR': IQR,
'lower_bound': lower_bound,
'upper_bound': upper_bound,
'n_outliers': len(outliers),
'outlier_values': outliers.tolist()
}
display_label = condition_labels[condition]
print(f"\n{display_label}:")
print(f" Q1: {Q1:.4f}")
print(f" Q3: {Q3:.4f}")
print(f" IQR: {IQR:.4f}")
print(f" Lower bound: {lower_bound:.4f}")
print(f" Upper bound: {upper_bound:.4f}")
print(f" Number of outliers: {len(outliers)}")
if len(outliers) > 0:
print(f" Outlier values: {[f'{x:.4f}' for x in outliers.tolist()]}")
# =========================================================================
# 4. SUMMARY STATISTICS
# =========================================================================
print("\n" + "="*80)
print("4. DESCRIPTIVE STATISTICS")
print("="*80)
for condition in conditions:
condition_data = pivot[pivot['condition'] == condition]['difference'].dropna()
if len(condition_data) > 0:
results['summary_statistics'][condition] = {
'n': len(condition_data),
'mean': condition_data.mean(),
'median': condition_data.median(),
'std': condition_data.std(),
'sem': condition_data.sem(),
'min': condition_data.min(),
'max': condition_data.max(),
'cv': (condition_data.std() / condition_data.mean() * 100) if condition_data.mean() != 0 else np.nan
}
display_label = condition_labels[condition]
print(f"\n{display_label}:")
print(f" N: {len(condition_data)}")
print(f" Mean: {condition_data.mean():.4f}")
print(f" Median: {condition_data.median():.4f}")
print(f" Std Dev: {condition_data.std():.4f}")
print(f" SEM: {condition_data.sem():.4f}")
print(f" Min: {condition_data.min():.4f}")
print(f" Max: {condition_data.max():.4f}")
print(f" CV (%): {(condition_data.std() / condition_data.mean() * 100):.2f}" if condition_data.mean() != 0 else " CV (%): N/A")
# =========================================================================
# 5. VISUAL DIAGNOSTICS
# =========================================================================
print("\n" + "="*80)
print("5. GENERATING DIAGNOSTIC PLOTS")
print("="*80)
# Create figure with subplots
fig = plt.figure(figsize=(24, 16))
# 5.1 Q-Q Plots (Normality check) - Individual Conditions
for idx, condition in enumerate(conditions, 1):
condition_data = pivot[pivot['condition'] == condition]['difference'].dropna()
if len(condition_data) > 2:
ax = plt.subplot(4, 5, idx)
stats.probplot(condition_data, dist="norm", plot=ax)
ax.set_title(f'Q-Q Plot: {condition_labels[condition]}', fontsize=9)
ax.grid(True, alpha=0.3)
# 5.1b Q-Q Plots for PAIRED DIFFERENCES (MOST IMPORTANT!)
plot_idx = 4
if len(aligned) >= 3:
ax = plt.subplot(4, 5, plot_idx)
diff_before_after = aligned['before'] - aligned['after']
stats.probplot(diff_before_after, dist="norm", plot=ax)
ax.set_title('Q-Q Plot: Paired Diff (Before - After)', fontsize=9, fontweight='bold', color='red')
ax.grid(True, alpha=0.3)
plot_idx += 1
if len(aligned2) >= 3:
ax = plt.subplot(4, 5, plot_idx)
diff_after_reapp = aligned2['after'] - aligned2['reapp']
stats.probplot(diff_after_reapp, dist="norm", plot=ax)
ax.set_title('Q-Q Plot: Paired Diff (After - Reapp)', fontsize=9, fontweight='bold', color='red')
ax.grid(True, alpha=0.3)
# 5.2 Histograms with normal curve overlay - Individual Conditions
for idx, condition in enumerate(conditions, 1):
condition_data = pivot[pivot['condition'] == condition]['difference'].dropna()
if len(condition_data) > 0:
ax = plt.subplot(4, 5, idx + 5)
# Plot histogram
n, bins, patches = ax.hist(condition_data, bins='auto', density=True, alpha=0.7, color='skyblue', edgecolor='black')
# Overlay normal distribution
mu, sigma = condition_data.mean(), condition_data.std()
x = np.linspace(condition_data.min(), condition_data.max(), 100)
ax.plot(x, stats.norm.pdf(x, mu, sigma), 'r-', linewidth=2, label='Normal curve')
ax.set_title(f'Histogram: {condition_labels[condition]}', fontsize=9)
ax.set_xlabel('Friction Difference (mN)', fontsize=8)
ax.set_ylabel('Density', fontsize=8)
ax.legend(fontsize=7)
ax.grid(True, alpha=0.3)
# 5.2b Histograms for PAIRED DIFFERENCES (MOST IMPORTANT!)
plot_idx = 9
if len(aligned) >= 3:
ax = plt.subplot(4, 5, plot_idx)
diff_before_after = aligned['before'] - aligned['after']
n, bins, patches = ax.hist(diff_before_after, bins='auto', density=True, alpha=0.7, color='lightcoral', edgecolor='black')
mu, sigma = diff_before_after.mean(), diff_before_after.std()
if len(diff_before_after) > 1:
x = np.linspace(diff_before_after.min(), diff_before_after.max(), 100)
ax.plot(x, stats.norm.pdf(x, mu, sigma), 'r-', linewidth=2, label='Normal curve')
ax.set_title('Histogram: Paired Diff (Before - After)', fontsize=9, fontweight='bold', color='red')
ax.set_xlabel('Paired Difference (mN)', fontsize=8)
ax.set_ylabel('Density', fontsize=8)
ax.legend(fontsize=7)
ax.grid(True, alpha=0.3)
plot_idx += 1
if len(aligned2) >= 3:
ax = plt.subplot(4, 5, plot_idx)
diff_after_reapp = aligned2['after'] - aligned2['reapp']
n, bins, patches = ax.hist(diff_after_reapp, bins='auto', density=True, alpha=0.7, color='lightgreen', edgecolor='black')
mu, sigma = diff_after_reapp.mean(), diff_after_reapp.std()
if len(diff_after_reapp) > 1:
x = np.linspace(diff_after_reapp.min(), diff_after_reapp.max(), 100)
ax.plot(x, stats.norm.pdf(x, mu, sigma), 'r-', linewidth=2, label='Normal curve')
ax.set_title('Histogram: Paired Diff (After - Reapp)', fontsize=9, fontweight='bold', color='red')
ax.set_xlabel('Paired Difference (mN)', fontsize=8)
ax.set_ylabel('Density', fontsize=8)
ax.legend(fontsize=7)
ax.grid(True, alpha=0.3)
# 5.3 Box plots
ax = plt.subplot(4, 5, 11)
box_data = [pivot[pivot['condition'] == cond]['difference'].dropna() for cond in conditions]
box_labels = [condition_labels[cond] for cond in conditions]
bp = ax.boxplot(box_data, labels=box_labels, patch_artist=True)
for patch, color in zip(bp['boxes'], ['skyblue', 'lightcoral', 'lightgreen']):
patch.set_facecolor(color)
ax.set_title('Box Plot: All Conditions', fontsize=9)
ax.set_ylabel('Friction Difference (mN)', fontsize=8)
ax.tick_params(axis='x', rotation=45, labelsize=7)
ax.grid(True, alpha=0.3, axis='y')
# 5.4 Violin plot
ax = plt.subplot(4, 5, 12)
plot_df = pivot[['condition', 'difference']].dropna()
plot_df['condition_label'] = plot_df['condition'].map(condition_labels)
parts = ax.violinplot([plot_df[plot_df['condition'] == cond]['difference'] for cond in conditions],
positions=range(len(conditions)),
showmeans=True,
showmedians=True)
ax.set_xticks(range(len(conditions)))
ax.set_xticklabels([condition_labels[cond] for cond in conditions], rotation=45, ha='right', fontsize=7)
ax.set_title('Violin Plot: All Conditions', fontsize=9)
ax.set_ylabel('Friction Difference (mN)', fontsize=8)
ax.grid(True, alpha=0.3, axis='y')
# 5.5 Individual data points with mean ± SEM
ax = plt.subplot(4, 5, 13)
for idx, condition in enumerate(conditions):
condition_data = pivot[pivot['condition'] == condition]['difference'].dropna()
if len(condition_data) > 0:
# Plot individual points with jitter
x = np.random.normal(idx, 0.04, size=len(condition_data))
ax.scatter(x, condition_data, alpha=0.6, s=50)
# Plot mean with error bars
mean = condition_data.mean()
sem = condition_data.sem()
ax.errorbar(idx, mean, yerr=sem, fmt='ro', markersize=10, capsize=5, capthick=2, linewidth=2)
ax.set_xticks(range(len(conditions)))
ax.set_xticklabels([condition_labels[cond] for cond in conditions], rotation=45, ha='right', fontsize=7)
ax.set_title('Individual Data Points with Mean ± SEM', fontsize=9)
ax.set_ylabel('Friction Difference (mN)', fontsize=8)
ax.axhline(y=0, color='gray', linestyle='--', alpha=0.5)
ax.grid(True, alpha=0.3, axis='y')
# 5.6 Paired differences scatter plot (Before - After)
if len(aligned) >= 3:
ax = plt.subplot(4, 5, 14)
diff_before_after = aligned['before'] - aligned['after']
ax.scatter(range(len(diff_before_after)), diff_before_after, alpha=0.7, s=60, color='darkred')
ax.axhline(y=0, color='black', linestyle='--', linewidth=2)
ax.axhline(y=diff_before_after.mean(), color='red', linestyle='-', linewidth=2, label=f'Mean = {diff_before_after.mean():.3f}')
ax.set_title('Paired Differences: Before - After', fontsize=9, fontweight='bold')
ax.set_xlabel('Observation Index', fontsize=8)
ax.set_ylabel('Difference (mN)', fontsize=8)
ax.legend(fontsize=7)
ax.grid(True, alpha=0.3)
# 5.7 Paired differences scatter plot (After - Reapp)
if len(aligned2) >= 3:
ax = plt.subplot(4, 5, 15)
diff_after_reapp = aligned2['after'] - aligned2['reapp']
ax.scatter(range(len(diff_after_reapp)), diff_after_reapp, alpha=0.7, s=60, color='darkgreen')
ax.axhline(y=0, color='black', linestyle='--', linewidth=2)
ax.axhline(y=diff_after_reapp.mean(), color='green', linestyle='-', linewidth=2, label=f'Mean = {diff_after_reapp.mean():.3f}')
ax.set_title('Paired Differences: After - Reapplication', fontsize=9, fontweight='bold')
ax.set_xlabel('Observation Index', fontsize=8)
ax.set_ylabel('Difference (mN)', fontsize=8)
ax.legend(fontsize=7)
ax.grid(True, alpha=0.3)
# 5.8 Sample size summary
ax = plt.subplot(4, 5, 16)
ax.axis('off')
summary_text = "SAMPLE SIZE SUMMARY\n" + "="*30 + "\n\n"
for condition in conditions:
n = len(pivot[pivot['condition'] == condition]['difference'].dropna())
summary_text += f"{condition_labels[condition]}:\n n = {n}\n\n"
if len(aligned) >= 3:
summary_text += f"Paired (Before-After):\n n = {len(aligned)}\n\n"
if len(aligned2) >= 3:
summary_text += f"Paired (After-Reapp):\n n = {len(aligned2)}\n"
ax.text(0.1, 0.9, summary_text, transform=ax.transAxes,
fontsize=10, verticalalignment='top', fontfamily='monospace',
bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))
# 5.9 Recommendation summary
ax = plt.subplot(4, 5, 17)
ax.axis('off')
# Determine recommendations
all_normal = all([results['normality_tests'].get(cond, {}).get('is_normal', False)
for cond in conditions if cond in results['normality_tests']])
diff_normal = results['normality_tests'].get('difference_before_after', {}).get('is_normal', False)
equal_var = results['homogeneity_tests'].get('levene', {}).get('equal_variance', False)
recommendation_text = "STATISTICAL TEST RECOMMENDATIONS\n" + "="*40 + "\n\n"
if diff_normal:
recommendation_text += "✓ Paired t-test\n"
recommendation_text += " (Paired differences are normal)\n\n"
else:
recommendation_text += "✓ Wilcoxon signed-rank test\n"
recommendation_text += " (Paired differences not normal)\n\n"
if all_normal and equal_var:
recommendation_text += "✓ One-way repeated measures ANOVA\n"
recommendation_text += " (For comparing all 3 conditions)\n\n"
else:
recommendation_text += "✓ Friedman test\n"
recommendation_text += " (Non-parametric alternative)\n\n"
recommendation_text += "\nNote: With small sample size,\n"
recommendation_text += "consider reporting effect sizes\n"
recommendation_text += "and confidence intervals."
ax.text(0.1, 0.9, recommendation_text, transform=ax.transAxes,
fontsize=10, verticalalignment='top', fontfamily='monospace',
bbox=dict(boxstyle='round', facecolor='lightblue', alpha=0.5))
plt.tight_layout()
# Save or display
if output_dir:
import os
os.makedirs(output_dir, exist_ok=True)
output_path = os.path.join(output_dir, 'statistical_assumptions_diagnostic_plots.png')
plt.savefig(output_path, dpi=300, bbox_inches='tight')
print(f"\nDiagnostic plots saved to: {output_path}")
else:
plt.show()
plt.close()
# =========================================================================
# FINAL RECOMMENDATIONS
# =========================================================================
print("\n" + "="*80)
print("FINAL RECOMMENDATIONS")
print("="*80)
if diff_normal:
print("\n✓ Use PAIRED T-TEST for comparing Before vs After")
print(" Reason: Paired differences are normally distributed")
else:
print("\n✓ Use WILCOXON SIGNED-RANK TEST for comparing Before vs After")
print(" Reason: Paired differences are not normally distributed")
if all_normal and equal_var:
print("\n✓ Use ONE-WAY REPEATED MEASURES ANOVA for comparing all 3 conditions")
print(" Reason: Data meets parametric test assumptions")
else:
print("\n✓ Use FRIEDMAN TEST for comparing all 3 conditions")
print(" Reason: Data does not meet all parametric test assumptions")
print("\n⚠ IMPORTANT NOTES:")
print(" - With limited sample size, statistical power may be low")
print(" - Report effect sizes (Cohen's d) alongside p-values")
print(" - Consider reporting confidence intervals")
print(" - Be cautious about multiple comparisons (apply corrections if needed)")
print("\n" + "="*80)
return results
Код: Выделить всё
Before vs After (paired differences):
Sample size: 40
Shapiro-Wilk statistic: 0.9435
p-value: 0.0455
Result: ✗ Not normally distributed
Любая информация/предложения/помощь будут с благодарностью приняты. Я добавил сюжеты, чтобы читатели могли ознакомиться с этим постом, надеясь, что они могут быть полезны. Я более чем рад поделиться более подробной информацией (в той степени, в которой это не нарушает положения о конфиденциальности моего исследования).
Спасибо

Подробнее здесь: https://stackoverflow.com/questions/797 ... measures-f