Python

Introduction

Usage Guide

Pymaceuticals Project

# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np
import statistics
from statistics import variance
from statistics import stdev
from scipy.stats import sem

# Locations of the two input CSV files
mouse_metadata_path = "Mouse_metadata.csv"
study_results_path = "Study_results.csv"

# Load the per-mouse metadata and the per-timepoint study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# Join both tables on the mouse identifier, then arrange the columns
# in the order used throughout the rest of the analysis
column_order = [
    'Mouse ID',
    'Timepoint',
    'Tumor Volume (mm3)',
    'Metastatic Sites',
    'Drug Regimen',
    'Sex',
    'Age_months',
    'Weight (g)',
]
dataset1 = mouse_metadata.merge(study_results, on="Mouse ID")[column_order]

# Write the combined table to disk so it can be inspected outside the script
dataset1.to_csv('mousedata.csv', header=True, index=False)
# Count the mice in the study.
# Num_of_Mice holds the distinct mouse IDs; Num_of_Mice2 is how many there are.
# nunique() counts distinct non-null IDs directly, replacing the earlier
# sum-of-value_counts detour through the deprecated top-level pd.value_counts.
Num_of_Mice = dataset1['Mouse ID'].unique()
Num_of_Mice2 = dataset1['Mouse ID'].nunique()
Num_of_Mice2

# Build the cleaned DataFrame: locate every row belonging to mouse 'g989'
# (the duplicated ID) and remove those rows by their index labels.
duplicate_rows = dataset1.index[dataset1['Mouse ID'] == 'g989']
dataset2 = dataset1.drop(index=duplicate_rows)
dataset2

# Summary statistics of the tumor volume for each drug regimen (manual method).
# This section produces the mean and median tables; each is a one-column
# DataFrame indexed by 'Drug Regimen' so that it can be merged with the
# variance / standard deviation / SEM tables on that key.

# Order the cleaned data by regimen name
dataset3 = dataset2.sort_values(by='Drug Regimen', ascending=True)
dataset3

# Select the tumor-volume column *before* aggregating. Passing a column list
# to mean()/median() does not select columns -- the first positional argument
# is the numeric_only flag -- so the statistic would be computed for every
# numeric column and the extras would have to be dropped afterwards.
tumor_by_regimen = dataset3.groupby('Drug Regimen')['Tumor Volume (mm3)']

# Mean tumor volume per regimen
Mean_TV = tumor_by_regimen.mean().to_frame(name='Mean Tumor Volume')
Mean_TV

# Median tumor volume per regimen
Median_TV = tumor_by_regimen.median().to_frame(name='Median Tumor Volume')
Median_TV

# Grouped view of the sorted data, keyed by drug regimen
Variance_TV = dataset3.groupby(by=['Drug Regimen'])
Variance_TV

# Distinct regimen names, in the order they appear in the sorted data.
# Two independent list copies are kept under the names used by the script.
regimen_names = dataset3['Drug Regimen'].unique()
dataset4 = regimen_names.tolist()
dataset4
Drug_Regimen = regimen_names.tolist()
Drug_Regimen

# Per-regimen lists of tumor volumes, used below for the manual
# variance / standard deviation / SEM calculations.


def _tumor_volumes_for(regimen_name):
    """Return the tumor volumes recorded under one drug regimen as a list.

    Rows are picked from ``dataset3`` with a boolean mask instead of walking
    the frame row by row, so the values keep the row order of ``dataset3``.
    """
    in_regimen = dataset3['Drug Regimen'] == regimen_name
    return dataset3.loc[in_regimen, 'Tumor Volume (mm3)'].tolist()


Capomulin_volumes = _tumor_volumes_for('Capomulin')
Ceftamin_volumes = _tumor_volumes_for('Ceftamin')
Infubinol_volumes = _tumor_volumes_for('Infubinol')
Ketapril_volumes = _tumor_volumes_for('Ketapril')
Naftisol_volumes = _tumor_volumes_for('Naftisol')
Placebo_volumes = _tumor_volumes_for('Placebo')
Propriva_volumes = _tumor_volumes_for('Propriva')
Ramicane_volumes = _tumor_volumes_for('Ramicane')
Stelasyn_volumes = _tumor_volumes_for('Stelasyn')
Zoniferol_volumes = _tumor_volumes_for('Zoniferol')

# Variance, standard deviation and standard error of the mean (SEM) of the
# tumor volume for each drug regimen, computed from the per-regimen lists.
# Each result table has the statistic in the first column and the regimen
# name in a 'Drug Regimen' column, which is the key used for merging later.

# Regimen name -> list of tumor volumes, in the order the tables are reported
regimen_volumes = {
    'Capomulin': Capomulin_volumes,
    'Ceftamin': Ceftamin_volumes,
    'Infubinol': Infubinol_volumes,
    'Ketapril': Ketapril_volumes,
    'Naftisol': Naftisol_volumes,
    'Placebo': Placebo_volumes,
    'Propriva': Propriva_volumes,
    'Ramicane': Ramicane_volumes,
    'Stelasyn': Stelasyn_volumes,
    'Zoniferol': Zoniferol_volumes,
}


def _statistic_table(stat_func, column_name):
    """Apply ``stat_func`` to every regimen's volumes and tabulate the result.

    The frame is built column-wise so the statistic column keeps a numeric
    dtype; transposing a mixed list of numbers and names would turn every
    column into ``object``.
    """
    return pd.DataFrame({
        column_name: [stat_func(volumes) for volumes in regimen_volumes.values()],
        'Drug Regimen': list(regimen_volumes),
    })


# Sample variance per regimen
Tumor_V_df = _statistic_table(variance, 'Tumor Volume Variance')
print(Tumor_V_df)

# Sample standard deviation per regimen (labelled as such, not as variance)
for regimen_name, volumes in regimen_volumes.items():
    print(f"Std. Dev. of {regimen_name}:", stdev(volumes))

Tumor_sd_df = _statistic_table(stdev, 'Tumor Volume Std. Dev.')
print(Tumor_sd_df)

# Standard error of the mean per regimen (labelled as such, not as variance)
for regimen_name, volumes in regimen_volumes.items():
    print(f"Std. Err. of {regimen_name}:", sem(volumes))

Tumor_sem_df = _statistic_table(sem, 'Tumor Volume Std. Err.')
print(Tumor_sem_df)

# Combine the five per-regimen statistic tables into one summary table,
# joining each on the regimen name and finally indexing the result by it
merged_df = (
    Mean_TV.merge(Median_TV, on="Drug Regimen")
    .merge(Tumor_V_df, on="Drug Regimen")
    .merge(Tumor_sd_df, on="Drug Regimen")
    .merge(Tumor_sem_df, on="Drug Regimen")
    .set_index('Drug Regimen')
)
merged_df

# Alternative summary: let pandas compute mean, median, variance, standard
# deviation and SEM of the tumor volume per regimen in one aggregation call
# (only one of the two summary methods is required in the solution)
summary_statistics = ["mean", "median", "var", "std", "sem"]
dataset3a = dataset3.groupby('Drug Regimen')['Tumor Volume (mm3)'].agg(summary_statistics)
dataset3a

# Bar plot (pandas API) of the number of rows (Mouse ID/Timepoint
# observations) recorded for each drug regimen.

# Count rows per regimen; bp is reused by the pyplot version of this chart
bp = dataset3['Drug Regimen'].value_counts().reset_index()
bp.columns = ['Drug Regimen', 'Mouse count']

# figsize has to be given when the plot is created: a later Axes.plot() call
# without data draws nothing and does not resize the figure.
plot1 = bp.plot.bar(x='Drug Regimen', y='Mouse count', rot=90, figsize=(8, 8))
plot1.set_ylabel("# of Observed Mouse Timepoints")
plot1

# Same chart drawn with pyplot: number of rows (Mouse ID/Timepoint
# observations) per drug regimen, taken from the counts table bp
fig = plt.figure(figsize=(10, 10))
regimen_labels = bp['Drug Regimen']
timepoint_counts = bp['Mouse count']
plt.bar(regimen_labels, timepoint_counts, width=0.5, color='b')
plt.ylabel("# of Observed Mouse Timepoints")
plt.xlabel("Drug Regimen")
plt.show()



# Final tumor volume of each mouse (later filtered to four regimens:
# Capomulin, Ramicane, Infubinol, and Ceftamin)

# Step 1: the greatest timepoint reached by every mouse, as a one-column
# DataFrame indexed by 'Mouse ID'
Final_tp_mouse2 = dataset2.groupby('Mouse ID')[['Timepoint']].max()

# Step 2: look those (mouse, timepoint) pairs up in the cleaned data to pull
# in the measurements taken at each mouse's last timepoint
Final_tp_mouse3 = Final_tp_mouse2.merge(dataset2, how='left', on=['Mouse ID', 'Timepoint'])
Final_tp_mouse3

# Quartiles, IQR and potential outliers of the final tumor volume for four
# treatments. The treatment list doubles as the plot labels later on.
treatments_list = ['Capomulin', 'Ramicane', 'Infubinol', 'Ceftamin']

# Final tumor volumes per treatment (one Series per entry, for plotting)
tumor_volume_list = []

# IQR of the final tumor volume, one value per treatment in list order
iqr_values = []

# One formatted outlier report per treatment
outliers1 = []

for treatment in treatments_list:
    # Final tumor volumes of the mice on the current treatment
    final_treatment = Final_tp_mouse3.loc[Final_tp_mouse3['Drug Regimen'] == treatment]
    tumor_volume_check = final_treatment['Tumor Volume (mm3)']
    tumor_volume_list.append(tumor_volume_check)

    # Interquartile range for the current treatment
    quartiles = tumor_volume_check.quantile([.25, .5, .75])
    lowerq = quartiles[0.25]
    upperq = quartiles[0.75]
    iqr = upperq - lowerq
    # Record the IQR so the list declared for it above is actually filled
    iqr_values.append(iqr)

    # Values more than 1.5 * IQR outside the quartiles count as outliers
    lower_bound = lowerq - (1.5 * iqr)
    upper_bound = upperq + (1.5 * iqr)

    # Collect the potential outliers and their report line
    outliers = tumor_volume_check[(tumor_volume_check < lower_bound) | (tumor_volume_check > upper_bound)]
    outliers1.append(f"{treatment}'s potential outliers: {outliers} ")

print(*outliers1, sep="\n")

# Box plot showing the distribution of the final tumor volume for each
# treatment group.
fig, ax = plt.subplots(figsize=(10, 7))

# Draw one box per treatment; outliers are drawn as red circles.
# The result is bound to its own name so that bp keeps referring to the
# per-regimen counts table used by the bar plots.
box_artists = ax.boxplot(
    tumor_volume_list,
    flierprops=dict(marker="o", markerfacecolor="red"),
)

# Label the boxes through the tick labels rather than boxplot's `labels`
# keyword, which newer Matplotlib releases deprecate.
ax.set_xticklabels(treatments_list)

# Label the y axis
ax.set_ylabel('Final Tumor Volume (mm3)')

# Display the plot
plt.show()

# Line plot of tumor volume vs. time point for a single mouse treated with
# Capomulin. The rows come from the cleaned data (dataset2) so this plot is
# based on the same records as the rest of the analysis.
Capo_mouse1 = dataset2.loc[dataset2['Mouse ID'] == "s185"]
Capo_mouse1

capomulin_line = plt.plot(Capo_mouse1['Timepoint'], Capo_mouse1['Tumor Volume (mm3)'])
# Label the axes and name the mouse so the figure is readable on its own
plt.title("Capomulin treatment of mouse s185")
plt.xlabel("Timepoint")
plt.ylabel("Tumor Volume (mm3)")
plt.show()

# Per-mouse averages of every numeric measurement for the Capomulin regimen,
# taken from the cleaned data (dataset2). numeric_only is passed by name:
# a column label given positionally to mean() is read as that flag rather
# than as a column selection.
average_observed_TV_Capo = dataset2.loc[dataset2['Drug Regimen'] == "Capomulin"]
average_observed_TV_Capo = average_observed_TV_Capo.groupby('Mouse ID').mean(numeric_only=True)
average_observed_TV_Capo

# Scatter plot of mouse weight vs. the average observed tumor volume for the
# entire Capomulin regimen
average_observed_TV_Capo.plot(kind="scatter", x="Weight (g)", y="Tumor Volume (mm3)", grid=True, figsize=(8,8),
              title="Capomulin")
plt.show()

from scipy.stats import linregress

# Correlation coefficient and linear regression model for mouse weight and
# average observed tumor volume over the entire Capomulin regimen
x_values = average_observed_TV_Capo['Weight (g)']
y_values = average_observed_TV_Capo["Tumor Volume (mm3)"]

# Pearson correlation coefficient (first element of the pearsonr result)
correlation = st.pearsonr(x_values, y_values)
print(f"The correlation between mouse weight and the average tumor volume is {round(correlation[0],2)}")

# Least-squares fit and the fitted tumor volume at each observed weight
(slope, intercept, rvalue, pvalue, stderr) = linregress(x_values, y_values)
regress_values = x_values * slope + intercept
line_eq = "y = " + str(round(slope,2)) + "x + " + str(round(intercept,2))

# Scatter of the observations with the regression line drawn on top
plt.scatter(x_values, y_values)
plt.plot(x_values, regress_values, "r-")
# Anchor the equation in axes-fraction coordinates (upper-left corner) so it
# is always inside the visible plot area, whatever the data range is.
plt.annotate(line_eq, xy=(0.05, 0.9), xycoords='axes fraction', fontsize=15, color="red")
plt.xlabel('Weight')
plt.ylabel('Average Tumor Volume (mm3)')
plt.show()
  Tumor Volume Variance Drug Regimen
0             24.947764    Capomulin
1             39.290177     Ceftamin
2             43.128684    Infubinol
3             68.553577     Ketapril
4             66.173479     Naftisol
5             61.168083      Placebo
6             43.852013     Propriva
7             23.486704     Ramicane
8             59.450562     Stelasyn
9             48.533355    Zoniferol
Variance of Capomulin: 4.9947736805840215
Variance of Ceftamin: 6.2681877184141985
Variance of Infubinol: 6.5672432670669405
Variance of Ketapril: 8.279708757706759
Variance of Naftisol: 8.134708291473338
Variance of Placebo: 7.821002681031188
Variance of Propriva: 6.622085246583617
Variance of Ramicane: 4.846308280753017
Variance of Stelasyn: 7.7104190335782645
Variance of Zoniferol: 6.966588504381904
  Tumor Volume Std. Dev. Drug Regimen
0               4.994774    Capomulin
1               6.268188     Ceftamin
2               6.567243    Infubinol
3               8.279709     Ketapril
4               8.134708     Naftisol
5               7.821003      Placebo
6               6.622085     Propriva
7               4.846308     Ramicane
8               7.710419     Stelasyn
9               6.966589    Zoniferol
Variance of Capomulin: 0.32934562340083096
Variance of Ceftamin: 0.46982053275261093
Variance of Infubinol: 0.49223569380113824
Variance of Ketapril: 0.6038598237739697
Variance of Naftisol: 0.5964657512424235
Variance of Placebo: 0.5813305510593877
Variance of Propriva: 0.544332054194047
Variance of Ramicane: 0.3209546065084817
Variance of Stelasyn: 0.5731109332771458
Variance of Zoniferol: 0.5163978968332169
  Tumor Volume Std. Err. Drug Regimen
0               0.329346    Capomulin
1               0.469821     Ceftamin
2               0.492236    Infubinol
3                0.60386     Ketapril
4               0.596466     Naftisol
5               0.581331      Placebo
6               0.544332     Propriva
7               0.320955     Ramicane
8               0.573111     Stelasyn
9               0.516398    Zoniferol

Capomulin's potential outliers: Series([], Name: Tumor Volume (mm3), dtype: float64) 
Ramicane's potential outliers: Series([], Name: Tumor Volume (mm3), dtype: float64) 
Infubinol's potential outliers: 31    36.321346
Name: Tumor Volume (mm3), dtype: float64 
Ceftamin's potential outliers: Series([], Name: Tumor Volume (mm3), dtype: float64) 

The correlation between mouse weight and the average tumor volume is 0.84

Python Projects