Breast cancer is one of the most common cancers among women, and early detection can save lives. This study explores the potential of an artificial-intelligence-based approach to breast cancer diagnosis.
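Ten real-valued features are computed for each cell nucleus: radius, texture, perimeter, area, smoothness, compactness, concavity, concave points, symmetry, and fractal dimension. Each feature appears in the dataset in three variants (`_mean`, `_se`, and `_worst`), giving 30 feature columns in total.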
1. Load libraries and clean data
3. Data preprocessing and feature engineering
4. Grid search cross-validation
# Mount Google Drive inside the Colab runtime so files stored there can be read.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from sklearn.preprocessing import RobustScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold,cross_val_score, cross_val_predict
from sklearn.feature_selection import RFECV
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
#from sklearn.metrics import confusion_matrix, plot_confusion_matrix, roc_curve, precision_recall_curve, auc
from sklearn.metrics import confusion_matrix, roc_curve, precision_recall_curve, auc
from plotly.subplots import make_subplots
import itertools
from sklearn.linear_model import LogisticRegression
import os
from IPython.display import display
# Read the Breast Cancer Wisconsin CSV from the mounted Drive folder.
DATASET_CSV = '/content/drive/MyDrive/Education/Masters _Data_Science/SIG788_Engineering_AI_solutions/task1/Breast_Cancer_Wisconsin.csv'
Breast_Cancer_df = pd.read_csv(DATASET_CSV)
# Display four randomly sampled rows as a quick sanity check of the load.
Breast_Cancer_df.sample(4)
id | diagnosis | radius_mean | texture_mean | perimeter_mean | area_mean | smoothness_mean | compactness_mean | concavity_mean | concave points_mean | ... | texture_worst | perimeter_worst | area_worst | smoothness_worst | compactness_worst | concavity_worst | concave points_worst | symmetry_worst | fractal_dimension_worst | Unnamed: 32 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
549 | 923465 | B | 10.82 | 24.21 | 68.89 | 361.6 | 0.08192 | 0.06602 | 0.01548 | 0.00816 | ... | 31.45 | 83.9 | 505.6 | 0.1204 | 0.1633 | 0.06194 | 0.03264 | 0.3059 | 0.07626 | NaN |
495 | 914333 | B | 14.87 | 20.21 | 96.12 | 680.9 | 0.09587 | 0.08345 | 0.06824 | 0.04951 | ... | 28.48 | 103.9 | 783.6 | 0.1216 | 0.1388 | 0.17000 | 0.10170 | 0.2369 | 0.06599 | NaN |
448 | 911150 | B | 14.53 | 19.34 | 94.25 | 659.7 | 0.08388 | 0.07800 | 0.08817 | 0.02925 | ... | 28.39 | 108.1 | 830.5 | 0.1089 | 0.2649 | 0.37790 | 0.09594 | 0.2471 | 0.07463 | NaN |
64 | 85922302 | M | 12.68 | 23.84 | 82.69 | 499.0 | 0.11220 | 0.12620 | 0.11280 | 0.06873 | ... | 33.47 | 111.8 | 888.3 | 0.1851 | 0.4061 | 0.40240 | 0.17160 | 0.3383 | 0.10310 | NaN |
4 rows × 33 columns
# Count the missing (NaN) entries in every column.
missing_values_count = Breast_Cancer_df.isna().sum()
# Leave the Series as the cell's last expression so the notebook renders it.
missing_values_count
id 0 diagnosis 0 radius_mean 0 texture_mean 0 perimeter_mean 0 area_mean 0 smoothness_mean 0 compactness_mean 0 concavity_mean 0 concave points_mean 0 symmetry_mean 0 fractal_dimension_mean 0 radius_se 0 texture_se 0 perimeter_se 0 area_se 0 smoothness_se 0 compactness_se 0 concavity_se 0 concave points_se 0 symmetry_se 0 fractal_dimension_se 0 radius_worst 0 texture_worst 0 perimeter_worst 0 area_worst 0 smoothness_worst 0 compactness_worst 0 concavity_worst 0 concave points_worst 0 symmetry_worst 0 fractal_dimension_worst 0 Unnamed: 32 569 dtype: int64
# Drop the `id` column and the `Unnamed: 32` column, which is entirely
# missing (569 nulls in 569 rows).
# errors='ignore' keeps this cell re-runnable: without it, a second execution
# raises KeyError because the in-place drop has already removed the columns.
Breast_Cancer_df.drop(columns=['id', 'Unnamed: 32'], inplace=True, errors='ignore')
# Confirm the resulting (rows, columns) shape.
Breast_Cancer_df.shape
(569, 31)
# List the distinct class labels present in the target column.
Breast_Cancer_df['diagnosis'].unique()
array(['M', 'B'], dtype=object)
# Summary statistics (count, mean, std, min, quartiles, max) per numeric column.
Breast_Cancer_df.describe()
radius_mean | texture_mean | perimeter_mean | area_mean | smoothness_mean | compactness_mean | concavity_mean | concave points_mean | symmetry_mean | fractal_dimension_mean | ... | radius_worst | texture_worst | perimeter_worst | area_worst | smoothness_worst | compactness_worst | concavity_worst | concave points_worst | symmetry_worst | fractal_dimension_worst | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 | ... | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 |
mean | 14.127292 | 19.289649 | 91.969033 | 654.889104 | 0.096360 | 0.104341 | 0.088799 | 0.048919 | 0.181162 | 0.062798 | ... | 16.269190 | 25.677223 | 107.261213 | 880.583128 | 0.132369 | 0.254265 | 0.272188 | 0.114606 | 0.290076 | 0.083946 |
std | 3.524049 | 4.301036 | 24.298981 | 351.914129 | 0.014064 | 0.052813 | 0.079720 | 0.038803 | 0.027414 | 0.007060 | ... | 4.833242 | 6.146258 | 33.602542 | 569.356993 | 0.022832 | 0.157336 | 0.208624 | 0.065732 | 0.061867 | 0.018061 |
min | 6.981000 | 9.710000 | 43.790000 | 143.500000 | 0.052630 | 0.019380 | 0.000000 | 0.000000 | 0.106000 | 0.049960 | ... | 7.930000 | 12.020000 | 50.410000 | 185.200000 | 0.071170 | 0.027290 | 0.000000 | 0.000000 | 0.156500 | 0.055040 |
25% | 11.700000 | 16.170000 | 75.170000 | 420.300000 | 0.086370 | 0.064920 | 0.029560 | 0.020310 | 0.161900 | 0.057700 | ... | 13.010000 | 21.080000 | 84.110000 | 515.300000 | 0.116600 | 0.147200 | 0.114500 | 0.064930 | 0.250400 | 0.071460 |
50% | 13.370000 | 18.840000 | 86.240000 | 551.100000 | 0.095870 | 0.092630 | 0.061540 | 0.033500 | 0.179200 | 0.061540 | ... | 14.970000 | 25.410000 | 97.660000 | 686.500000 | 0.131300 | 0.211900 | 0.226700 | 0.099930 | 0.282200 | 0.080040 |
75% | 15.780000 | 21.800000 | 104.100000 | 782.700000 | 0.105300 | 0.130400 | 0.130700 | 0.074000 | 0.195700 | 0.066120 | ... | 18.790000 | 29.720000 | 125.400000 | 1084.000000 | 0.146000 | 0.339100 | 0.382900 | 0.161400 | 0.317900 | 0.092080 |
max | 28.110000 | 39.280000 | 188.500000 | 2501.000000 | 0.163400 | 0.345400 | 0.426800 | 0.201200 | 0.304000 | 0.097440 | ... | 36.040000 | 49.540000 | 251.200000 | 4254.000000 | 0.222600 | 1.058000 | 1.252000 | 0.291000 | 0.663800 | 0.207500 |
8 rows × 30 columns
Each feature appears to have outliers, as shown in the box plots, so outliers need to be addressed during data preprocessing.
Most of the feature distributions appear approximately Gaussian but with a right skew.
# Draw a box plot and a histogram side by side for every numeric feature.
col = pd.Series(Breast_Cancer_df.select_dtypes(include=['int64', 'float64']).columns)
sns.set_theme()
for feature_name in col:
    # The previous layout used subplots_adjust margins outside the 0-1 figure
    # fraction range (left=8, right=10, bottom=1, top=1.5 on a 10x10 in
    # figure), i.e. an axes area of roughly 20x5 in that is only visible when
    # the backend crops to a tight bounding box. Size the figure directly.
    f, axes = plt.subplots(1, 2, figsize=(20, 5))
    sns.boxplot(Breast_Cancer_df[feature_name], ax=axes[0])
    sns.histplot(Breast_Cancer_df[feature_name], ax=axes[1])
    f.tight_layout()
    # Render and release each figure before creating the next one; leaving
    # them all open triggers matplotlib's "More than 20 figures have been
    # opened" RuntimeWarning and holds every figure in memory.
    plt.show()
    plt.close(f)
<ipython-input-98-c526f9488c98>:5: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`). Consider using `matplotlib.pyplot.close()`. f, axes = plt.subplots(1, 2, figsize=(10, 10))
# Compare the six features at column positions 5 through 10 on one set of axes.
subset_df = Breast_Cancer_df.iloc[:, 5:11]

# One figure, one Axes; seaborn draws a box per column of the wide-form frame.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=subset_df, width=0.4, fliersize=5, ax=ax)

# Axis labels and title.
ax.set_xlabel('Variables')
ax.set_ylabel('Values')
ax.set_title('Box plots of selected variables')

plt.show()
# Count how many samples fall into each diagnosis class.
diagnosis_counts = Breast_Cancer_df['diagnosis'].value_counts()

# Build labels, values and colours from the SAME iteration order.
# value_counts() orders by frequency, so hard-coding the values as [B, M]
# would pair each wedge with the wrong label and colour whenever 'M' is the
# more frequent class.
labels = [f"{label} ({count})" for label, count in diagnosis_counts.items()]
values = diagnosis_counts.tolist()

# Fixed colour per class, looked up by label rather than by position.
diagnosis_palette = {'B': 'lightgreen', 'M': 'red'}
colors = [diagnosis_palette[label] for label in diagnosis_counts.index]

# Draw the pie chart with matplotlib.
plt.figure(figsize=(6, 6))
plt.pie(values, labels=labels, autopct='%1.1f%%', colors=colors, startangle=140)
plt.title('Diagnosis Distribution')
plt.axis('equal')  # Equal aspect ratio so the pie is drawn as a circle.
plt.show()
# Pairwise relationships between features, coloured by diagnosis, with
# histograms on the diagonal.
# NOTE(review): the frame reports 31 columns after the id/Unnamed drop, so the
# `:30` slice leaves out the last column — confirm this is intentional.
pairplot_frame = Breast_Cancer_df.iloc[:, :30]
sns.pairplot(pairplot_frame, hue='diagnosis', diag_kind='hist', height=1.6)
<seaborn.axisgrid.PairGrid at 0x7dac6de4cac0>
# Visualize the pairwise relationships using pairplot, coloured by diagnosis
# with histograms on the diagonal.
# NOTE(review): this is the same pairplot call, with the same arguments, as the
# one issued just before it in this notebook; the large grid is rendered twice.
# Consider keeping only one of the two cells.
#sns.pairplot(Breast_Cancer_df.iloc[:, 1:])
sns.pairplot(Breast_Cancer_df.iloc[:,:30],hue='diagnosis', diag_kind='hist',height=1.6)
<seaborn.axisgrid.PairGrid at 0x7dac39aebd30>