Exoplanet Detection using Hybrid Ensemble ML Architecture (Part III)¶
TESS Objects of Interest (TOI)¶
In this notebook we focus on the TESS mission data source: the TESS Objects of Interest (TOI) table hosted at the NASA Exoplanet Archive.
We explore the open-source data collected by the Transiting Exoplanet Survey Satellite (TESS) and build a suitable ML model for exoplanet identification.
The TESS mission is ongoing (roughly 7 years, 5 months and 16 days at the time of writing). This work builds upon the previous studies listed in the references section.
# Adding imports
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import requests
import urllib.parse
import io
import joblib
import warnings
# suppress all warnings
warnings.filterwarnings('ignore')
# Define the query for the cumulative table
query = "SELECT * FROM TOI"
# Encode the query for the URL
encoded_query = urllib.parse.quote(query)
# TAP URL
tap_url = f"https://exoplanetarchive.ipac.caltech.edu/TAP/sync?query={encoded_query}&format=csv"
print("Downloading TESS Objects of Interest (TOI) data...")
print(f"URL: {tap_url}")
# Download the data
response = requests.get(tap_url)
# Check if successful
if response.status_code == 200:
    print("Download successful!")
else:
    print(f"Error: {response.status_code}")
Downloading TESS Objects of Interest (TOI) data... URL: https://exoplanetarchive.ipac.caltech.edu/TAP/sync?query=SELECT%20%2A%20FROM%20TOI&format=csv Download successful!
# Convert response to DataFrame
tess_data = pd.read_csv(io.StringIO(response.text))
print(f"Dataset shape: {tess_data.shape}")
print(f"Number of columns: {len(tess_data.columns)}")
print(f"First few column names: {list(tess_data.columns[:10])}")
Dataset shape: (7703, 91) Number of columns: 91 First few column names: ['tid', 'toi', 'toidisplay', 'toipfx', 'ctoi_alias', 'pl_pnum', 'tfopwg_disp', 'st_tmag', 'st_tmagerr1', 'st_tmagerr2']
# Check the data types and basic info
print("\n=== Dataset Info ===")
print(tess_data.info())
print("\n=== First 5 rows ===")
print(tess_data.head())
print("\n=== Disposition counts ===")
print(tess_data['tfopwg_disp'].value_counts())
=== Dataset Info ===
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7703 entries, 0 to 7702
Data columns (total 91 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 tid 7703 non-null int64
1 toi 7703 non-null float64
2 toidisplay 7703 non-null object
3 toipfx 7703 non-null int64
4 ctoi_alias 7703 non-null float64
5 pl_pnum 7703 non-null int64
6 tfopwg_disp 7703 non-null object
7 st_tmag 7703 non-null float64
8 st_tmagerr1 7703 non-null float64
9 st_tmagerr2 7703 non-null float64
10 st_tmagsymerr 7703 non-null int64
11 st_tmaglim 7703 non-null int64
12 rastr 7703 non-null object
13 ra 7703 non-null float64
14 raerr1 0 non-null float64
15 raerr2 0 non-null float64
16 rasymerr 0 non-null float64
17 decstr 7703 non-null object
18 dec 7703 non-null float64
19 decerr1 0 non-null float64
20 decerr2 0 non-null float64
21 decsymerr 0 non-null float64
22 st_pmra 7569 non-null float64
23 st_pmraerr1 7569 non-null float64
24 st_pmraerr2 7569 non-null float64
25 st_pmrasymerr 7569 non-null float64
26 st_pmralim 7569 non-null float64
27 st_pmdec 7569 non-null float64
28 st_pmdecerr1 7569 non-null float64
29 st_pmdecerr2 7569 non-null float64
30 st_pmdecsymerr 7569 non-null float64
31 st_pmdeclim 7569 non-null float64
32 pl_tranmid 7703 non-null float64
33 pl_tranmiderr1 7692 non-null float64
34 pl_tranmiderr2 7692 non-null float64
35 pl_tranmidsymerr 7703 non-null int64
36 pl_tranmidlim 7703 non-null int64
37 pl_orbper 7596 non-null float64
38 pl_orbpererr1 7572 non-null float64
39 pl_orbpererr2 7572 non-null float64
40 pl_orbpersymerr 7703 non-null int64
41 pl_orbperlim 7703 non-null int64
42 pl_trandurh 7703 non-null float64
43 pl_trandurherr1 7690 non-null float64
44 pl_trandurherr2 7690 non-null float64
45 pl_trandurhsymerr 7703 non-null int64
46 pl_trandurhlim 7703 non-null int64
47 pl_trandep 7703 non-null float64
48 pl_trandeperr1 7697 non-null float64
49 pl_trandeperr2 7697 non-null float64
50 pl_trandepsymerr 7703 non-null int64
51 pl_trandeplim 7703 non-null int64
52 pl_rade 7197 non-null float64
53 pl_radeerr1 6080 non-null float64
54 pl_radeerr2 6080 non-null float64
55 pl_radesymerr 7703 non-null int64
56 pl_radelim 7703 non-null int64
57 pl_insol 7527 non-null float64
58 pl_insolerr1 0 non-null float64
59 pl_insolerr2 0 non-null float64
60 pl_insolsymerr 0 non-null float64
61 pl_insollim 0 non-null float64
62 pl_eqt 7392 non-null float64
63 pl_eqterr1 0 non-null float64
64 pl_eqterr2 0 non-null float64
65 pl_eqtsymerr 0 non-null float64
66 pl_eqtlim 0 non-null float64
67 st_dist 7488 non-null float64
68 st_disterr1 6996 non-null float64
69 st_disterr2 6996 non-null float64
70 st_distsymerr 7703 non-null int64
71 st_distlim 7703 non-null int64
72 st_teff 7542 non-null float64
73 st_tefferr1 7229 non-null float64
74 st_tefferr2 7229 non-null float64
75 st_teffsymerr 7703 non-null int64
76 st_tefflim 7703 non-null int64
77 st_logg 6847 non-null float64
78 st_loggerr1 5432 non-null float64
79 st_loggerr2 5432 non-null float64
80 st_loggsymerr 7703 non-null int64
81 st_logglim 7703 non-null int64
82 st_rad 7196 non-null float64
83 st_raderr1 5740 non-null float64
84 st_raderr2 5740 non-null float64
85 st_radsymerr 7703 non-null int64
86 st_radlim 7703 non-null int64
87 sectors 0 non-null float64
88 toi_created 7703 non-null object
89 rowupdate 7703 non-null object
90 release_date 7703 non-null object
dtypes: float64(61), int64(23), object(7)
memory usage: 5.3+ MB
None
=== First 5 rows ===
tid toi toidisplay toipfx ctoi_alias pl_pnum tfopwg_disp \
0 16288184 1049.01 TOI-1049.01 1049 1.628818e+07 1 KP
1 144065872 105.01 TOI-105.01 105 1.440659e+08 1 KP
2 66818296 1050.01 TOI-1050.01 1050 6.681830e+07 1 KP
3 259863352 1051.01 TOI-1051.01 1051 2.598634e+08 1 FA
4 317060587 1052.01 TOI-1052.01 1052 3.170606e+08 1 CP
st_tmag st_tmagerr1 st_tmagerr2 ... st_logglim st_rad st_raderr1 \
0 11.0657 0.006 -0.006 ... 0 1.27146 0.063558
1 9.4995 0.006 -0.006 ... 0 1.23824 0.059699
2 11.0261 0.006 -0.006 ... 0 1.57000 0.090000
3 7.1278 0.006 -0.006 ... 0 1.56486 0.186629
4 9.0197 0.006 -0.006 ... 0 1.58000 0.165123
st_raderr2 st_radsymerr st_radlim sectors toi_created \
0 -0.063558 1 0 NaN 2019-07-15 19:20:04
1 -0.059699 1 0 NaN 2018-09-05 18:49:20
2 -0.090000 1 0 NaN 2019-07-15 19:20:05
3 -0.186629 1 0 NaN 2019-08-16 20:20:45
4 -0.165123 1 0 NaN 2019-08-16 20:20:47
rowupdate release_date
0 2022-03-30 16:02:02 2025-09-28 22:48:44
1 2025-03-25 16:00:01 2025-09-28 22:48:44
2 2022-10-05 10:10:01 2025-09-28 22:48:44
3 2024-09-17 10:08:02 2025-09-28 22:48:44
4 2023-07-24 12:03:31 2025-09-28 22:48:44
[5 rows x 91 columns]
=== Disposition counts ===
tfopwg_disp
PC 4679
FP 1197
CP 684
KP 583
APC 462
FA 98
Name: count, dtype: int64
tess_data.tail(5)
| | tid | toi | toidisplay | toipfx | ctoi_alias | pl_pnum | tfopwg_disp | st_tmag | st_tmagerr1 | st_tmagerr2 | ... | st_logglim | st_rad | st_raderr1 | st_raderr2 | st_radsymerr | st_radlim | sectors | toi_created | rowupdate | release_date |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 7698 | 23912656 | 7215.01 | TOI-7215.01 | 7215 | 2.391266e+07 | 1 | PC | 12.3680 | 0.026 | -0.026 | ... | 0 | NaN | NaN | NaN | 1 | 0 | NaN | 2025-03-06 22:39:52 | 2025-03-07 12:05:24 | 2025-09-28 22:48:44 |
| 7699 | 69436501 | 7216.01 | TOI-7216.01 | 7216 | 6.943650e+07 | 1 | PC | 12.9812 | 0.012 | -0.012 | ... | 0 | 1.71 | 0.09 | -0.09 | 1 | 0 | NaN | 2025-03-06 22:39:52 | 2025-08-06 12:04:41 | 2025-09-28 22:48:44 |
| 7700 | 151760597 | 7217.01 | TOI-7217.01 | 7217 | 1.517606e+08 | 1 | PC | 12.3942 | 0.007 | -0.007 | ... | 0 | 1.71 | 0.10 | -0.10 | 1 | 0 | NaN | 2025-03-06 22:39:52 | 2025-03-07 12:05:24 | 2025-09-28 22:48:44 |
| 7701 | 279331929 | 7218.01 | TOI-7218.01 | 7218 | 2.793319e+08 | 1 | PC | 12.0952 | 0.020 | -0.020 | ... | 0 | 1.64 | 0.08 | -0.08 | 1 | 0 | NaN | 2025-03-06 22:39:52 | 2025-08-06 12:04:41 | 2025-09-28 22:48:44 |
| 7702 | 242674266 | 7219.01 | TOI-7219.01 | 7219 | 2.426743e+08 | 1 | PC | 11.9146 | 0.006 | -0.006 | ... | 0 | 1.22 | 0.06 | -0.06 | 1 | 0 | NaN | 2025-03-06 22:39:52 | 2025-05-05 12:03:42 | 2025-09-28 22:48:44 |
5 rows × 91 columns
# Create the scatter plot
plt.figure(figsize=(10, 6))
plt.scatter(tess_data['pl_trandep'], tess_data['pl_trandurh'],
alpha=0.6, s=20, c='#1e40af', edgecolors='none')
# Add labels and title
plt.xlabel('TESS Depth [ppm]', fontsize=12, fontweight='bold')
plt.ylabel('TESS Duration [hrs]', fontsize=12, fontweight='bold')
plt.title('Transit Duration vs Transit Depth', fontsize=14, fontweight='bold')
# Add grid for better readability
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
# where are the outliers?
# Box plots to identify outliers
plt.figure(figsize=(15, 5))
tess_data[['pl_orbper', 'pl_trandep', 'pl_trandurh']].boxplot()
plt.title('Outlier Detection in Key Features')
plt.xticks(rotation=45)
plt.show()
# class distribution w.r.t. to tfopwg_disp
plt.figure(figsize=(8, 6))
class_counts = tess_data['tfopwg_disp'].value_counts()
colors = [
'#10b981', # Green (Emerald)
'#f59e0b', # Orange (Amber)
'#ef4444', # Red
'#3b82f6', # Blue
'#8b5cf6', # Purple
'#06b6d4' # Cyan
]
tfopwg_legend = {
'APC': 'Ambiguous Planetary Candidate',
'CP': 'Confirmed Planet',
'FA': 'False Alarm',
'FP': 'False Positive',
'KP': 'Known Planet',
'PC': 'Planetary Candidate'
}
# Create bars
bars = plt.bar(class_counts.index, class_counts.values, color=colors, alpha=0.8, width=0.3)
# Add a legend mapping disposition codes to their full names
legend_labels = [f"{code}: {tfopwg_legend[code]}" for code in class_counts.index]
plt.legend(bars, legend_labels, loc='upper right', bbox_to_anchor=(1.3, 1))
plt.title('Distribution of Exoplanet Classes (Original TOI Dataset)')
plt.xlabel('Class')
plt.ylabel('Count')
plt.xticks(rotation=0)
# Add value labels on bars
for bar, count in zip(bars, class_counts.values):
    plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 50,
             f'{count:,}', ha='center', va='bottom', fontweight='bold')
plt.grid(axis='y', alpha=0.3)
plt.tight_layout()
plt.show()
print("Class distribution:")
for class_name, count in class_counts.items():
    percentage = (count / len(tess_data)) * 100
    print(f"{class_name}: {count:,} ({percentage:.1f}%)")
Class distribution: PC: 4,679 (60.7%) FP: 1,197 (15.5%) CP: 684 (8.9%) KP: 583 (7.6%) APC: 462 (6.0%) FA: 98 (1.3%)
Data Preprocessing¶
Only tfopwg_disp values corresponding to CONFIRMED planets (CP, KP) and planetary CANDIDATEs (PC) are retained for classification; FALSE POSITIVE (FP), FALSE ALARM (FA) and ambiguous (APC) entries are removed.
# Keep only CP and KP (CONFIRMED) and PC (CANDIDATE); drop FP, FA and APC
tess_data_binary = tess_data[tess_data['tfopwg_disp'].isin(['CP', 'KP', 'PC'])].copy()
print(f"Upon removing FALSE POSITIVES: {tess_data_binary.shape[0]} rows")
print(f"Remaining classes: {tess_data_binary['tfopwg_disp'].value_counts()}")
# Drop rows where the disposition is NaN before creating y_binary
tess_data_binary.dropna(subset=['tfopwg_disp'], inplace=True)
# Create the binary target variable
y_binary = tess_data_binary['tfopwg_disp'].map({
    'CP': 1,  # Confirmed Planet
    'KP': 1,  # Known Planet (also confirmed)
    'PC': 0   # Planetary Candidate
})
Upon removing FALSE POSITIVES: 5946 rows Remaining classes: tfopwg_disp PC 4679 CP 684 KP 583 Name: count, dtype: int64
In the next step, null values are inspected. Columns with > 50% values missing are identified.
missing_percentages = (tess_data_binary.isnull().sum() / len(tess_data_binary)) * 100
high_missing = missing_percentages[missing_percentages > 50]
print(f"Columns with >50% missing data: {len(high_missing)}")
print(high_missing.sort_values(ascending=False))
# Show columns with >80% missing
excessive_missing = missing_percentages[missing_percentages > 80]
print(f"\nColumns with >80% missing data: {len(excessive_missing)}")
print(excessive_missing.sort_values(ascending=False))
Columns with >50% missing data: 15 raerr1 100.0 raerr2 100.0 rasymerr 100.0 decerr1 100.0 decerr2 100.0 decsymerr 100.0 pl_insolerr1 100.0 pl_insolerr2 100.0 pl_insolsymerr 100.0 pl_insollim 100.0 pl_eqterr1 100.0 pl_eqterr2 100.0 pl_eqtsymerr 100.0 pl_eqtlim 100.0 sectors 100.0 dtype: float64 Columns with >80% missing data: 15 raerr1 100.0 raerr2 100.0 rasymerr 100.0 decerr1 100.0 decerr2 100.0 decsymerr 100.0 pl_insolerr1 100.0 pl_insolerr2 100.0 pl_insolsymerr 100.0 pl_insollim 100.0 pl_eqterr1 100.0 pl_eqterr2 100.0 pl_eqtsymerr 100.0 pl_eqtlim 100.0 sectors 100.0 dtype: float64
# Drop the columns above due to excessive null values
columns_drop = missing_percentages[missing_percentages > 50].index.tolist()
print(f"Columns to drop: {len(columns_drop)}")
print("Dropping these columns:")
for col in columns_drop:
    print(f" - {col} ({missing_percentages[col]:.1f}% missing)")
# Apply the drops
tess_data_clean = tess_data_binary.drop(columns=columns_drop)
print(f"\nInitial dataset: {tess_data_binary.shape}")
print(f"Cleaned dataset: {tess_data_clean.shape}")
print(f"Removed {tess_data_binary.shape[1] - tess_data_clean.shape[1]} columns")
Columns to drop: 15 Dropping these columns: - raerr1 (100.0% missing) - raerr2 (100.0% missing) - rasymerr (100.0% missing) - decerr1 (100.0% missing) - decerr2 (100.0% missing) - decsymerr (100.0% missing) - pl_insolerr1 (100.0% missing) - pl_insolerr2 (100.0% missing) - pl_insolsymerr (100.0% missing) - pl_insollim (100.0% missing) - pl_eqterr1 (100.0% missing) - pl_eqterr2 (100.0% missing) - pl_eqtsymerr (100.0% missing) - pl_eqtlim (100.0% missing) - sectors (100.0% missing) Initial dataset: (5946, 91) Cleaned dataset: (5946, 76) Removed 15 columns
# check if any values for tfopwg_disp is Nan
# Check for NaN values in tfopwg_disp column
nan_count = tess_data_clean['tfopwg_disp'].isna().sum()
print(f"Number of NaN values in tfopwg_disp column: {nan_count}")
Number of NaN values in tfopwg_disp column: 0
empty_strings = (tess_data_clean['tfopwg_disp'] == '').sum()
print(f"Empty strings: {empty_strings}")
Empty strings: 0
Feature Engineering & Selection¶
Of the initial 91 columns, 15 columns were removed. To reduce dimensionality further, Pearson correlation is used.
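For reference, the Pearson correlation coefficient that pandas' .corr() computes for each pair of numeric features $x$ and $y$ is

$$ r_{xy} = \frac{\sum_{i=1}^{n} (x_i - \bar{x})(y_i - \bar{y})}{\sqrt{\sum_{i=1}^{n} (x_i - \bar{x})^2}\,\sqrt{\sum_{i=1}^{n} (y_i - \bar{y})^2}}, $$

so values near $\pm 1$ indicate that two features carry nearly duplicate information, while values near $0$ indicate little linear relationship.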
# plot correlation for numeric columns
numeric_columns = tess_data_clean.select_dtypes(include=[np.number]).columns
correlation_matrix = tess_data_clean[numeric_columns].corr()
print(f"Correlation matrix shape: {correlation_matrix.shape}")
Correlation matrix shape: (69, 69)
# Create correlation heatmap
plt.figure(figsize=(8, 6))
mask = np.triu(np.ones_like(correlation_matrix, dtype=bool))
sns.heatmap(correlation_matrix,
mask=mask,
annot=False,
cmap='RdBu_r',
center=0,
square=True,
cbar_kws={'shrink': 0.8})
plt.title('Feature Correlation Matrix', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.show()
Given the previous graph, dimensionality can be reduced further: first by removing highly correlated features, and second by computing the correlation of each remaining feature with the target variable tfopwg_disp, keeping only the top 20 or 30 features and dropping everything else.
This allows us to keep only the necessary information.
# Remove highly correlated features
def remove_highly_correlated_features(df, threshold=0.8):
    # Compute correlation matrix of the numeric columns
    corr_matrix = df.select_dtypes(include=[np.number]).corr()
    # Collect the highly correlated pairs
    high_corr_pairs = []
    for i in range(len(corr_matrix.columns)):
        for j in range(i + 1, len(corr_matrix.columns)):
            if abs(corr_matrix.iloc[i, j]) > threshold:
                high_corr_pairs.append((corr_matrix.columns[i], corr_matrix.columns[j]))
    # From each pair, drop the feature with more missing values
    features_to_remove = set()
    for feat1, feat2 in high_corr_pairs:
        if df[feat1].isnull().sum() <= df[feat2].isnull().sum():
            features_to_remove.add(feat2)
        else:
            features_to_remove.add(feat1)
    return df.drop(columns=list(features_to_remove)), list(features_to_remove)

tess_data_final, removed_features = remove_highly_correlated_features(tess_data_clean, threshold=0.8)
print(f"Initial features: {tess_data_clean.shape[1]}")
print(f"After removal correlated features: {tess_data_final.shape[1]}")
print(f"Removed {len(removed_features)} features:")
for feat in removed_features:
    print(f" - {feat}")
Initial features: 76 After removal correlated features: 60 Removed 16 features: - ctoi_alias - st_loggerr2 - toipfx - st_logg - st_raderr2 - st_tmagerr2 - pl_radeerr2 - pl_orbpererr2 - pl_tranmiderr2 - pl_trandurherr2 - st_pmraerr2 - st_pmdecerr1 - st_tefferr2 - st_pmdecerr2 - pl_trandeperr2 - st_disterr2
In the following step, the tfopwg_disp target is used to further reduce the DataFrame's dimensionality.
target = 'tfopwg_disp'
if target in tess_data_clean.columns:
    # Calculate correlation of the numeric features with the target disposition
    target_corr = tess_data_clean.select_dtypes(include=[np.number]).corrwith(
        pd.get_dummies(tess_data_clean[target]).iloc[:, 0]
    ).abs().sort_values(ascending=False)
    print("Top features correlated with target 'tfopwg_disp':")
    print(target_corr.head(10))
Top features correlated with target 'tfopwg_disp': toipfx 0.325497 toi 0.325497 st_tmag 0.282037 st_dist 0.229015 pl_rade 0.226503 st_logg 0.181789 st_teff 0.179020 pl_pnum 0.173937 pl_eqt 0.167106 st_rad 0.164805 dtype: float64
# Compute correlation of the numeric features with the target (tfopwg_disp)
target_corr = tess_data_final.select_dtypes(include=[np.number]).corrwith(
    pd.get_dummies(tess_data_final['tfopwg_disp']).iloc[:, 0]
).abs().sort_values(ascending=False)
# Select the top correlated features (the target itself is non-numeric, so it is not included)
top_20_features = target_corr.head(20).index.tolist()
top_30_features = target_corr.head(30).index.tolist()
X_20_features = tess_data_final[top_20_features]
X_30_features = tess_data_final[top_30_features]
print(f"X_20 features shape: {X_20_features.shape}")
print(f"X_30 features shape: {X_30_features.shape}")
print(f"y_binary shape: {y_binary.shape}")
print(f"Target: tfopwg_disp")
X_20 features shape: (5946, 20) X_30 features shape: (5946, 30) y_binary shape: (5946,) Target: tfopwg_disp
# Calculate correlation matrix for top 20 features
correlation_matrix = tess_data_clean[top_20_features].corr()
# Create the heatmap
plt.figure(figsize=(10, 10))
sns.heatmap(correlation_matrix,
annot=True,
cmap='coolwarm',
center=0,
square=True,
fmt='.2f',
cbar_kws={'shrink': 0.8})
# Rotate the labels
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.title('TESS Top 20 Features Correlation Matrix', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.savefig('tess_correlation_matrix.png', dpi=300, bbox_inches='tight')
plt.show()
# Differences with 30-feature model
additional_features = set(top_30_features) - set(top_20_features)
print(f"\nAdditional features in 30-feature model:")
for i, feature in enumerate(additional_features, 1):
    importance = target_corr[feature]
    print(f"  {i:2d}. {feature:25s} {importance:.3f}")
Additional features in 30-feature model: 1. tid 0.021 2. pl_trandurh 0.014 3. pl_orbpererr1 0.019 4. pl_tranmiderr1 0.012 5. pl_orbper 0.016 6. st_tmagsymerr nan 7. dec 0.011 8. st_loggerr1 0.020 9. st_pmra 0.005 10. pl_trandurherr1 0.013
# Let us plot a graph to understand
fig, ax = plt.subplots(figsize=(8, 6))
top_20_features_corr = target_corr.head(20)
# Create bars
bars = ax.barh(range(len(top_20_features_corr)), top_20_features_corr.values,
color='#1e40af', alpha=0.8, height=0.8)
# Styling
ax.set_yticks(range(len(top_20_features_corr)))
ax.set_yticklabels(top_20_features_corr.index, fontsize=10)
ax.set_xlabel('Correlation Coefficient', fontsize=12, fontweight='bold')
ax.set_title('TESS Feature Scores for Top 20 Features', fontsize=14, fontweight='bold')
# create a grid
ax.grid(axis='x', alpha=0.3, linestyle='-', linewidth=0.5)
# add statistics
n_features = len(top_20_features_corr)
max_corr = top_20_features_corr.max()
ax.text(0.02, 0.98, f'Top {n_features} Features\nMax Correlation: {max_corr:.3f}',
transform=ax.transAxes, fontsize=10,
verticalalignment='top', bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))
ax.invert_yaxis()
plt.tight_layout()
plt.show()
Apply Scikit-Learn Iterative Imputer¶
Despite the reduced dimensionality, the dataset still contains null entries. Dropping the columns containing null values can cause a loss of information, affecting the accuracy of the model. To address this problem, scikit-learn's iterative imputer is used, as per [1].
The iterative imputer models p(x|y), with x being the feature with missing values and y the features containing observed values.
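As a minimal sketch of this round-robin idea (hypothetical toy data, and the default BayesianRidge estimator rather than the random-forest estimator used below), each column's missing entries are repeatedly predicted from the other columns until the imputed values stabilise:

# Hedged sketch (not part of the pipeline): round-robin imputation on toy data.
# The DataFrame below is hypothetical; the default BayesianRidge estimator is used here.
toy = pd.DataFrame({
    'pl_orbper':   [1.2, 3.4, np.nan, 8.1, 2.2],
    'pl_trandep':  [500.0, np.nan, 900.0, 1200.0, 650.0],
    'pl_trandurh': [2.1, 3.0, 4.2, np.nan, 2.5],
})
# Each column's NaNs are predicted from the other columns, iterating until
# the change between rounds falls below the tolerance.
toy_imputed = IterativeImputer(max_iter=10, random_state=0).fit_transform(toy)
print(pd.DataFrame(toy_imputed, columns=toy.columns))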
# check if X_features datasets are pd
print(type(X_20_features))
print(type(X_30_features))
<class 'pandas.core.frame.DataFrame'> <class 'pandas.core.frame.DataFrame'>
# Use IterativeImputer with a RandomForestRegressor as the estimator
# Apply imputation to the feature datasets only
imputer_20 = IterativeImputer(
    estimator=RandomForestRegressor(n_estimators=10, random_state=42),
    max_iter=10,
    random_state=42,
    verbose=1
)
imputer_30 = IterativeImputer(
    estimator=RandomForestRegressor(n_estimators=10, random_state=42),
    max_iter=10,
    random_state=42,
    verbose=1
)
# Impute missing values
X_20_imputed = imputer_20.fit_transform(X_20_features)
X_30_imputed = imputer_30.fit_transform(X_30_features)
# Convert to DataFrames
X_20_imputed_df = pd.DataFrame(X_20_imputed, columns=X_20_features.columns, index=X_20_features.index)
X_30_imputed_df = pd.DataFrame(X_30_imputed, columns=X_30_features.columns, index=X_30_features.index)
print(f"20-feature DataFrame after imputation: {X_20_imputed_df.shape}")
print(f"30-feature DataFrame after imputation: {X_30_imputed_df.shape}")
[IterativeImputer] Completing matrix with shape (5946, 20) [IterativeImputer] Change: 31188.0208546781, scaled tolerance: 2460.8630758159998 [IterativeImputer] Change: 19263.948990589997, scaled tolerance: 2460.8630758159998 [IterativeImputer] Change: 2665.3738247499987, scaled tolerance: 2460.8630758159998 [IterativeImputer] Change: 2192.3660066599996, scaled tolerance: 2460.8630758159998 [IterativeImputer] Early stopping criterion reached. [IterativeImputer] Completing matrix with shape (5946, 30) [IterativeImputer] Change: 30673.454963276115, scaled tolerance: 2010186.093 [IterativeImputer] Early stopping criterion reached. 20-feature DataFrame after imputation: (5946, 20) 30-feature DataFrame after imputation: (5946, 30)
Model Training¶
For the model training, the dataset is split into training and test sets with an 80/20 ratio. StandardScaler is then fitted on the training split only and applied to both splits, to avoid any data leakage from the test set into the model.
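As a side note, the same leakage-free setup could be expressed with an sklearn Pipeline, which refits the scaler only on whatever data .fit() receives. The following is a hedged sketch of that alternative (reusing X_20_imputed_df and y_binary from above, with a single random forest instead of the full ensemble), not the approach taken in the cells below:

# Hedged alternative sketch: wrap scaling and a classifier in one Pipeline,
# so the scaler is only ever fitted on the training fold.
from sklearn.pipeline import Pipeline

X_tr, X_te, y_tr, y_te = train_test_split(
    X_20_imputed_df, y_binary, test_size=0.2, random_state=42
)
pipe = Pipeline([
    ('scaler', StandardScaler()),   # fitted on X_tr only, inside pipe.fit()
    ('clf', RandomForestClassifier(n_estimators=100, random_state=42)),
])
pipe.fit(X_tr, y_tr)
print(f"Pipeline hold-out accuracy: {pipe.score(X_te, y_te):.3f}")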
# Split data using the y_binary by 80/20 split
X_train_20, X_test_20, y_train_20, y_test_20 = train_test_split(
X_20_imputed_df, y_binary, test_size=0.2, random_state=42
)
X_train_30, X_test_30, y_train_30, y_test_30 = train_test_split(
X_30_imputed_df, y_binary, test_size=0.2, random_state=42
)
print(f"20-feature training set: {X_train_20.shape}")
print(f"20-feature test set: {X_test_20.shape}")
print(f"30-feature training set: {X_train_30.shape}")
print(f"30-feature test set: {X_test_30.shape}")
20-feature training set: (4756, 20) 20-feature test set: (1190, 20) 30-feature training set: (4756, 30) 30-feature test set: (1190, 30)
# Define a standard scaler
scaler_20 = StandardScaler()
scaler_30 = StandardScaler()
# Fit scalers on training data only
X_train_20_scaled = scaler_20.fit_transform(X_train_20)
X_test_20_scaled = scaler_20.transform(X_test_20)
X_train_30_scaled = scaler_30.fit_transform(X_train_30)
X_test_30_scaled = scaler_30.transform(X_test_30)
# Convert back to DataFrames
X_train_20_scaled_df = pd.DataFrame(X_train_20_scaled, columns=X_train_20.columns, index=X_train_20.index)
X_test_20_scaled_df = pd.DataFrame(X_test_20_scaled, columns=X_test_20.columns, index=X_test_20.index)
X_train_30_scaled_df = pd.DataFrame(X_train_30_scaled, columns=X_train_30.columns, index=X_train_30.index)
X_test_30_scaled_df = pd.DataFrame(X_test_30_scaled, columns=X_test_30.columns, index=X_test_30.index)
print(f"20-feature training set (scaled): {X_train_20_scaled_df.shape}")
print(f"20-feature test set (scaled): {X_test_20_scaled_df.shape}")
20-feature training set (scaled): (4756, 20) 20-feature test set (scaled): (1190, 20)
Now the ensembles for the two feature sets are created. Note that the maximum number of iterations for the neural network is set to 500: experiments showed that using 1000 iterations provides a negligible improvement in model accuracy while increasing the training time.
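With voting='soft', the ensemble's prediction for each sample is the class with the highest average of the members' predicted probabilities (unweighted here). A tiny hedged sketch of that rule, using two hypothetical members' predict_proba outputs:

# Hedged sketch of the soft-voting rule: average class probabilities, take the argmax.
proba_a = np.array([[0.7, 0.3],
                    [0.4, 0.6]])   # hypothetical predict_proba from one member
proba_b = np.array([[0.6, 0.4],
                    [0.2, 0.8]])   # hypothetical predict_proba from another member
avg_proba = (proba_a + proba_b) / 2    # unweighted soft vote
print(avg_proba.argmax(axis=1))        # predicted classes -> [0 1]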
ensemble_20 = VotingClassifier([
('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
('xgb', XGBClassifier(random_state=42)),
('svm', SVC(probability=True, random_state=42)),
('lr', LogisticRegression(random_state=42, max_iter=2000)),
('deep_nn', MLPClassifier(hidden_layer_sizes=(200, 100, 50, 25, 10), max_iter=500, random_state=42))
], voting='soft')
ensemble_30 = VotingClassifier([
('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
('xgb', XGBClassifier(random_state=42)),
('svm', SVC(probability=True, random_state=42)),
('lr', LogisticRegression(random_state=42, max_iter=2000)),
('deep_nn', MLPClassifier(hidden_layer_sizes=(200, 100, 50, 25, 10), max_iter=500, random_state=42))
], voting='soft')
# Train ensemble
ensemble_20.fit(X_train_20_scaled_df, y_train_20)
# Evaluation
y_pred_20 = ensemble_20.predict(X_test_20_scaled_df)
print(f"Ensemble_20 Accuracy: {accuracy_score(y_test_20, y_pred_20):.3f}")
print(classification_report(y_test_20, y_pred_20))
Ensemble_20 Accuracy: 0.861
precision recall f1-score support
0 0.87 0.95 0.91 909
1 0.79 0.56 0.65 281
accuracy 0.86 1190
macro avg 0.83 0.76 0.78 1190
weighted avg 0.85 0.86 0.85 1190
# AUC score for the 20-feature model
y_pred_proba_20 = ensemble_20.predict_proba(X_test_20_scaled)
auc_20 = roc_auc_score(y_test_20, y_pred_proba_20[:, 1])
print(f"20-Feature Model AUC Score: {auc_20:.3f}")
20-Feature Model AUC Score: 0.911
# Saving the 20-feature model
model_package_20 = {
    'model': ensemble_20,
    'imputer': imputer_20,
    'scaler': scaler_20,
    'features': top_20_features,
    'accuracy': 0.861,
    'auc_score': 0.911,
    'mission': 'tess',
    'classes': ['CANDIDATE', 'CONFIRMED'],
    'class_mapping': {'CANDIDATE': 0, 'CONFIRMED': 1},
    'n_features': 20,
    'performance': {
        'candidate_precision': 0.87,
        'candidate_recall': 0.95,
        'confirmed_precision': 0.79,
        'confirmed_recall': 0.56
    }
}
joblib.dump(model_package_20, 'tess_final_20_features_model.pkl')
['tess_final_20_features_model.pkl']
Train the 30-feature model¶
# Train ensemble
ensemble_30.fit(X_train_30_scaled_df, y_train_30)
# Evaluation
y_pred_30 = ensemble_30.predict(X_test_30_scaled_df)
print(f"Ensemble_30 Accuracy: {accuracy_score(y_test_30, y_pred_30):.3f}")
print(classification_report(y_test_30, y_pred_30))
Ensemble_30 Accuracy: 0.858
precision recall f1-score support
0 0.88 0.95 0.91 909
1 0.77 0.57 0.66 281
accuracy 0.86 1190
macro avg 0.82 0.76 0.78 1190
weighted avg 0.85 0.86 0.85 1190
# Get prediction probabilities
y_pred_proba_30 = ensemble_30.predict_proba(X_test_30_scaled)
# Calculate AUC score
auc_30 = roc_auc_score(y_test_30, y_pred_proba_30[:, 1])
print(f"30-Feature Model AUC Score: {auc_30:.3f}")
30-Feature Model AUC Score: 0.915
Note that using 30 features yields a slightly higher AUC (0.915 vs 0.911) but marginally lower accuracy (0.858 vs 0.861) than the 20-feature ensemble. The 30-feature model is now saved as a .pkl file as well.
# Save the 30-feature model
model_package_30 = {
    'model': ensemble_30,
    'imputer': imputer_30,
    'scaler': scaler_30,
    'features': top_30_features,
    'accuracy': 0.858,
    'auc_score': 0.915,
    'mission': 'tess',
    'classes': ['CANDIDATE', 'CONFIRMED'],
    'class_mapping': {'CANDIDATE': 0, 'CONFIRMED': 1},
    'n_features': 30,
    'performance': {
        'candidate_precision': 0.88,
        'candidate_recall': 0.95,
        'confirmed_precision': 0.77,
        'confirmed_recall': 0.57
    }
}
joblib.dump(model_package_30, 'tess_30_features.pkl')
print("Final TESS model saved with 30 features")
Final TESS model saved with 30 features
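As a usage note, either saved package can later be reloaded and applied to new TOI rows. The sketch below is hedged: new_toi is a hypothetical DataFrame containing the raw (unimputed, unscaled) columns listed in package['features'], and the transform order mirrors the training pipeline above (impute, then scale, then predict).

# Hedged sketch: reload a saved package and score new candidates.
package = joblib.load('tess_30_features.pkl')

def classify_toi(new_toi):
    """Return the probability that each row is a confirmed planet (class 1)."""
    X = new_toi[package['features']]          # select the expected raw columns
    X = package['imputer'].transform(X)       # fill missing values
    X = package['scaler'].transform(X)        # apply the training-set scaling
    return pd.Series(package['model'].predict_proba(X)[:, 1],
                     index=new_toi.index, name='p_confirmed')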
Results & Discussion¶
The Hybrid ML Ensemble Architecture does not provide improved performance with the top-30 feature set compared to the top-20 feature set: accuracy is essentially unchanged (0.858 vs 0.861), and only the AUC improves marginally (0.915 vs 0.911).
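The two ensembles compare as follows (figures taken from the classification reports and AUC scores above):

| Model | Accuracy | AUC | Confirmed precision | Confirmed recall |
|---|---|---|---|---|
| 20-feature ensemble | 0.861 | 0.911 | 0.79 | 0.56 |
| 30-feature ensemble | 0.858 | 0.915 | 0.77 | 0.57 |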
References¶
[1] Saha, R. (2021). Comparing Classification Models on Kepler Data. arXiv preprint arXiv:2101.01904 [astro-ph.EP]. Viewed 5/10/2025, https://arxiv.org/abs/2101.01904.
[2] Luz, T. S. F., Braga, R. A. S., & Ribeiro, E. R. (2024). Assessment of Ensemble-Based Machine Learning Algorithms for Exoplanet Identification. Electronics, 13(19), 3950. https://doi.org/10.3390/electronics13193950