Exoplanet Detection using Hybrid Ensemble ML Architecture (Part III)¶
TESS Objects of Interest (TOI)¶
In this notebook we focus on the TESS mission data source: the TESS Objects of Interest (TOI) table hosted at the NASA Exoplanet Archive.
We explore the open-source data collected by the Transiting Exoplanet Survey Satellite (TESS) and build a suitable ML model for exoplanet identification.
The TESS mission is ongoing (roughly 7 years, 5 months and 16 days at the time of writing). This work builds upon the previous studies listed in the references section.
# Adding imports
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import requests
import urllib.parse
import io
import joblib
import warnings
# suppress all warnings
warnings.filterwarnings('ignore')
# Define the query for the cumulative table
query = "SELECT * FROM TOI"
# Encode the query for the URL
encoded_query = urllib.parse.quote(query)
# TAP URL
tap_url = f"https://exoplanetarchive.ipac.caltech.edu/TAP/sync?query={encoded_query}&format=csv"
print("Downloading TESS Objects of Interest (TOI) data...")
print(f"URL: {tap_url}")
# Download the data
response = requests.get(tap_url)
# Check if successful
if response.status_code == 200:
    print("Download successful!")
else:
    print(f"Error: {response.status_code}")
Downloading TESS Objects of Interest (TOI) data... URL: https://exoplanetarchive.ipac.caltech.edu/TAP/sync?query=SELECT%20%2A%20FROM%20TOI&format=csv Download successful!
# Convert response to DataFrame
tess_data = pd.read_csv(io.StringIO(response.text))
print(f"Dataset shape: {tess_data.shape}")
print(f"Number of columns: {len(tess_data.columns)}")
print(f"First few column names: {list(tess_data.columns[:10])}")
Dataset shape: (7703, 91) Number of columns: 91 First few column names: ['tid', 'toi', 'toidisplay', 'toipfx', 'ctoi_alias', 'pl_pnum', 'tfopwg_disp', 'st_tmag', 'st_tmagerr1', 'st_tmagerr2']
# Check the data types and basic info
print("\n=== Dataset Info ===")
print(tess_data.info())
print("\n=== First 5 rows ===")
print(tess_data.head())
print("\n=== Disposition counts ===")
print(tess_data['tfopwg_disp'].value_counts())
=== Dataset Info ===
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7703 entries, 0 to 7702
Data columns (total 91 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 tid 7703 non-null int64
1 toi 7703 non-null float64
2 toidisplay 7703 non-null object
3 toipfx 7703 non-null int64
4 ctoi_alias 7703 non-null float64
5 pl_pnum 7703 non-null int64
6 tfopwg_disp 7703 non-null object
7 st_tmag 7703 non-null float64
8 st_tmagerr1 7703 non-null float64
9 st_tmagerr2 7703 non-null float64
10 st_tmagsymerr 7703 non-null int64
11 st_tmaglim 7703 non-null int64
12 rastr 7703 non-null object
13 ra 7703 non-null float64
14 raerr1 0 non-null float64
15 raerr2 0 non-null float64
16 rasymerr 0 non-null float64
17 decstr 7703 non-null object
18 dec 7703 non-null float64
19 decerr1 0 non-null float64
20 decerr2 0 non-null float64
21 decsymerr 0 non-null float64
22 st_pmra 7569 non-null float64
23 st_pmraerr1 7569 non-null float64
24 st_pmraerr2 7569 non-null float64
25 st_pmrasymerr 7569 non-null float64
26 st_pmralim 7569 non-null float64
27 st_pmdec 7569 non-null float64
28 st_pmdecerr1 7569 non-null float64
29 st_pmdecerr2 7569 non-null float64
30 st_pmdecsymerr 7569 non-null float64
31 st_pmdeclim 7569 non-null float64
32 pl_tranmid 7703 non-null float64
33 pl_tranmiderr1 7692 non-null float64
34 pl_tranmiderr2 7692 non-null float64
35 pl_tranmidsymerr 7703 non-null int64
36 pl_tranmidlim 7703 non-null int64
37 pl_orbper 7596 non-null float64
38 pl_orbpererr1 7572 non-null float64
39 pl_orbpererr2 7572 non-null float64
40 pl_orbpersymerr 7703 non-null int64
41 pl_orbperlim 7703 non-null int64
42 pl_trandurh 7703 non-null float64
43 pl_trandurherr1 7690 non-null float64
44 pl_trandurherr2 7690 non-null float64
45 pl_trandurhsymerr 7703 non-null int64
46 pl_trandurhlim 7703 non-null int64
47 pl_trandep 7703 non-null float64
48 pl_trandeperr1 7697 non-null float64
49 pl_trandeperr2 7697 non-null float64
50 pl_trandepsymerr 7703 non-null int64
51 pl_trandeplim 7703 non-null int64
52 pl_rade 7197 non-null float64
53 pl_radeerr1 6080 non-null float64
54 pl_radeerr2 6080 non-null float64
55 pl_radesymerr 7703 non-null int64
56 pl_radelim 7703 non-null int64
57 pl_insol 7527 non-null float64
58 pl_insolerr1 0 non-null float64
59 pl_insolerr2 0 non-null float64
60 pl_insolsymerr 0 non-null float64
61 pl_insollim 0 non-null float64
62 pl_eqt 7392 non-null float64
63 pl_eqterr1 0 non-null float64
64 pl_eqterr2 0 non-null float64
65 pl_eqtsymerr 0 non-null float64
66 pl_eqtlim 0 non-null float64
67 st_dist 7488 non-null float64
68 st_disterr1 6996 non-null float64
69 st_disterr2 6996 non-null float64
70 st_distsymerr 7703 non-null int64
71 st_distlim 7703 non-null int64
72 st_teff 7542 non-null float64
73 st_tefferr1 7229 non-null float64
74 st_tefferr2 7229 non-null float64
75 st_teffsymerr 7703 non-null int64
76 st_tefflim 7703 non-null int64
77 st_logg 6847 non-null float64
78 st_loggerr1 5432 non-null float64
79 st_loggerr2 5432 non-null float64
80 st_loggsymerr 7703 non-null int64
81 st_logglim 7703 non-null int64
82 st_rad 7196 non-null float64
83 st_raderr1 5740 non-null float64
84 st_raderr2 5740 non-null float64
85 st_radsymerr 7703 non-null int64
86 st_radlim 7703 non-null int64
87 sectors 0 non-null float64
88 toi_created 7703 non-null object
89 rowupdate 7703 non-null object
90 release_date 7703 non-null object
dtypes: float64(61), int64(23), object(7)
memory usage: 5.3+ MB
None
=== First 5 rows ===
tid toi toidisplay toipfx ctoi_alias pl_pnum tfopwg_disp \
0 16288184 1049.01 TOI-1049.01 1049 1.628818e+07 1 KP
1 144065872 105.01 TOI-105.01 105 1.440659e+08 1 KP
2 66818296 1050.01 TOI-1050.01 1050 6.681830e+07 1 KP
3 259863352 1051.01 TOI-1051.01 1051 2.598634e+08 1 FA
4 317060587 1052.01 TOI-1052.01 1052 3.170606e+08 1 CP
st_tmag st_tmagerr1 st_tmagerr2 ... st_logglim st_rad st_raderr1 \
0 11.0657 0.006 -0.006 ... 0 1.27146 0.063558
1 9.4995 0.006 -0.006 ... 0 1.23824 0.059699
2 11.0261 0.006 -0.006 ... 0 1.57000 0.090000
3 7.1278 0.006 -0.006 ... 0 1.56486 0.186629
4 9.0197 0.006 -0.006 ... 0 1.58000 0.165123
st_raderr2 st_radsymerr st_radlim sectors toi_created \
0 -0.063558 1 0 NaN 2019-07-15 19:20:04
1 -0.059699 1 0 NaN 2018-09-05 18:49:20
2 -0.090000 1 0 NaN 2019-07-15 19:20:05
3 -0.186629 1 0 NaN 2019-08-16 20:20:45
4 -0.165123 1 0 NaN 2019-08-16 20:20:47
rowupdate release_date
0 2022-03-30 16:02:02 2025-09-28 22:48:44
1 2025-03-25 16:00:01 2025-09-28 22:48:44
2 2022-10-05 10:10:01 2025-09-28 22:48:44
3 2024-09-17 10:08:02 2025-09-28 22:48:44
4 2023-07-24 12:03:31 2025-09-28 22:48:44
[5 rows x 91 columns]
=== Disposition counts ===
tfopwg_disp
PC 4679
FP 1197
CP 684
KP 583
APC 462
FA 98
Name: count, dtype: int64
tess_data.tail(5)
| | tid | toi | toidisplay | toipfx | ctoi_alias | pl_pnum | tfopwg_disp | st_tmag | st_tmagerr1 | st_tmagerr2 | ... | st_logglim | st_rad | st_raderr1 | st_raderr2 | st_radsymerr | st_radlim | sectors | toi_created | rowupdate | release_date |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 7698 | 23912656 | 7215.01 | TOI-7215.01 | 7215 | 2.391266e+07 | 1 | PC | 12.3680 | 0.026 | -0.026 | ... | 0 | NaN | NaN | NaN | 1 | 0 | NaN | 2025-03-06 22:39:52 | 2025-03-07 12:05:24 | 2025-09-28 22:48:44 |
| 7699 | 69436501 | 7216.01 | TOI-7216.01 | 7216 | 6.943650e+07 | 1 | PC | 12.9812 | 0.012 | -0.012 | ... | 0 | 1.71 | 0.09 | -0.09 | 1 | 0 | NaN | 2025-03-06 22:39:52 | 2025-08-06 12:04:41 | 2025-09-28 22:48:44 |
| 7700 | 151760597 | 7217.01 | TOI-7217.01 | 7217 | 1.517606e+08 | 1 | PC | 12.3942 | 0.007 | -0.007 | ... | 0 | 1.71 | 0.10 | -0.10 | 1 | 0 | NaN | 2025-03-06 22:39:52 | 2025-03-07 12:05:24 | 2025-09-28 22:48:44 |
| 7701 | 279331929 | 7218.01 | TOI-7218.01 | 7218 | 2.793319e+08 | 1 | PC | 12.0952 | 0.020 | -0.020 | ... | 0 | 1.64 | 0.08 | -0.08 | 1 | 0 | NaN | 2025-03-06 22:39:52 | 2025-08-06 12:04:41 | 2025-09-28 22:48:44 |
| 7702 | 242674266 | 7219.01 | TOI-7219.01 | 7219 | 2.426743e+08 | 1 | PC | 11.9146 | 0.006 | -0.006 | ... | 0 | 1.22 | 0.06 | -0.06 | 1 | 0 | NaN | 2025-03-06 22:39:52 | 2025-05-05 12:03:42 | 2025-09-28 22:48:44 |
5 rows × 91 columns
# Create the scatter plot
plt.figure(figsize=(10, 6))
plt.scatter(tess_data['pl_trandep'], tess_data['pl_trandurh'],
alpha=0.6, s=20, c='#1e40af', edgecolors='none')
# Add labels and title
plt.xlabel('TESS Depth [ppm]', fontsize=12, fontweight='bold')
plt.ylabel('TESS Duration [hrs]', fontsize=12, fontweight='bold')
plt.title('Transit Duration vs Transit Depth', fontsize=14, fontweight='bold')
# Add grid for better readability
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
# where are the outliers?
# Box plots to identify outliers
plt.figure(figsize=(15, 5))
tess_data[['pl_orbper', 'pl_trandep', 'pl_trandurh']].boxplot()
plt.title('Outlier Detection in Key Features')
plt.xticks(rotation=45)
plt.show()
# class distribution w.r.t. to tfopwg_disp
plt.figure(figsize=(8, 6))
class_counts = tess_data['tfopwg_disp'].value_counts()
colors = [
'#10b981', # Green (Emerald)
'#f59e0b', # Orange (Amber)
'#ef4444', # Red
'#3b82f6', # Blue
'#8b5cf6', # Purple
'#06b6d4' # Cyan
]
tfopwg_legend = {
'APC': 'Ambiguous Planetary Candidate',
'CP': 'Confirmed Planet',
'FA': 'False Alarm',
'FP': 'False Positive',
'KP': 'Known Planet',
'PC': 'Planetary Candidate'
}
# Create bars
bars = plt.bar(class_counts.index, class_counts.values, color=colors, alpha=0.8, width=0.3)
# Add a legend mapping disposition codes to their full names
legend_labels = [f"{code}: {tfopwg_legend[code]}" for code in class_counts.index]
plt.legend(bars, legend_labels, loc='upper right', bbox_to_anchor=(1.3, 1))
plt.title('Distribution of Exoplanet Classes (Original TOI Dataset)')
plt.xlabel('Class')
plt.ylabel('Count')
plt.xticks(rotation=0)
# Add value labels on bars
for bar, count in zip(bars, class_counts.values):
    plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 50,
             f'{count:,}', ha='center', va='bottom', fontweight='bold')
plt.grid(axis='y', alpha=0.3)
plt.tight_layout()
plt.show()
print("Class distribution:")
for class_name, count in class_counts.items():
    percentage = (count / len(tess_data)) * 100
    print(f"{class_name}: {count:,} ({percentage:.1f}%)")
Class distribution: PC: 4,679 (60.7%) FP: 1,197 (15.5%) CP: 684 (8.9%) KP: 583 (7.6%) APC: 462 (6.0%) FA: 98 (1.3%)
Data Preprocessing¶
Only tfopwg_disp values corresponding to CONFIRMED planets (CP, KP) and planetary CANDIDATEs (PC) are retained for classification; FALSE POSITIVE (FP), FALSE ALARM (FA) and ambiguous (APC) entries are removed.
# Keep only CP and KP (CONFIRMED) and PC (CANDIDATE); drop FP, FA and APC
tess_data_binary = tess_data[tess_data['tfopwg_disp'].isin(['CP', 'KP', 'PC'])].copy()
print(f"Upon removing FALSE POSITIVES: {tess_data_binary.shape[0]} rows")
print(f"Remaining classes: {tess_data_binary['tfopwg_disp'].value_counts()}")
# Drop rows where the disposition is NaN before creating y_binary
tess_data_binary.dropna(subset=['tfopwg_disp'], inplace=True)
# Create the binary target variable
y_binary = tess_data_binary['tfopwg_disp'].map({
    'CP': 1,  # Confirmed Planet
    'KP': 1,  # Known Planet (also confirmed)
    'PC': 0   # Planetary Candidate
})
Upon removing FALSE POSITIVES: 5946 rows Remaining classes: tfopwg_disp PC 4679 CP 684 KP 583 Name: count, dtype: int64
In the next step, null values are inspected. Columns with > 50% values missing are identified.
missing_percentages = (tess_data_binary.isnull().sum() / len(tess_data_binary)) * 100
high_missing = missing_percentages[missing_percentages > 50]
print(f"Columns with >50% missing data: {len(high_missing)}")
print(high_missing.sort_values(ascending=False))
# Show columns with >80% missing
excessive_missing = missing_percentages[missing_percentages > 80]
print(f"\nColumns with >80% missing data: {len(excessive_missing)}")
print(excessive_missing.sort_values(ascending=False))
Columns with >50% missing data: 15 raerr1 100.0 raerr2 100.0 rasymerr 100.0 decerr1 100.0 decerr2 100.0 decsymerr 100.0 pl_insolerr1 100.0 pl_insolerr2 100.0 pl_insolsymerr 100.0 pl_insollim 100.0 pl_eqterr1 100.0 pl_eqterr2 100.0 pl_eqtsymerr 100.0 pl_eqtlim 100.0 sectors 100.0 dtype: float64 Columns with >80% missing data: 15 raerr1 100.0 raerr2 100.0 rasymerr 100.0 decerr1 100.0 decerr2 100.0 decsymerr 100.0 pl_insolerr1 100.0 pl_insolerr2 100.0 pl_insolsymerr 100.0 pl_insollim 100.0 pl_eqterr1 100.0 pl_eqterr2 100.0 pl_eqtsymerr 100.0 pl_eqtlim 100.0 sectors 100.0 dtype: float64
# Drop the columns above due to excessive null values
columns_drop = missing_percentages[missing_percentages > 50].index.tolist()
print(f"Columns to drop: {len(columns_drop)}")
print("Dropping these columns:")
for col in columns_drop:
    print(f" - {col} ({missing_percentages[col]:.1f}% missing)")
# Apply the drops
tess_data_clean = tess_data_binary.drop(columns=columns_drop)
print(f"\nInitial dataset: {tess_data_binary.shape}")
print(f"Cleaned dataset: {tess_data_clean.shape}")
print(f"Removed {tess_data_binary.shape[1] - tess_data_clean.shape[1]} columns")
Columns to drop: 15 Dropping these columns: - raerr1 (100.0% missing) - raerr2 (100.0% missing) - rasymerr (100.0% missing) - decerr1 (100.0% missing) - decerr2 (100.0% missing) - decsymerr (100.0% missing) - pl_insolerr1 (100.0% missing) - pl_insolerr2 (100.0% missing) - pl_insolsymerr (100.0% missing) - pl_insollim (100.0% missing) - pl_eqterr1 (100.0% missing) - pl_eqterr2 (100.0% missing) - pl_eqtsymerr (100.0% missing) - pl_eqtlim (100.0% missing) - sectors (100.0% missing) Initial dataset: (5946, 91) Cleaned dataset: (5946, 76) Removed 15 columns
# check if any values for tfopwg_disp is Nan
# Check for NaN values in tfopwg_disp column
nan_count = tess_data_clean['tfopwg_disp'].isna().sum()
print(f"Number of NaN values in tfopwg_disp column: {nan_count}")
Number of NaN values in tfopwg_disp column: 0
empty_strings = (tess_data_clean['tfopwg_disp'] == '').sum()
print(f"Empty strings: {empty_strings}")
Empty strings: 0
Feature Engineering & Selection¶
Of the initial 91 columns, 15 columns were removed. To reduce dimensionality further, Pearson correlation is used.
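For reference, the Pearson correlation coefficient that pandas' .corr() computes for each pair of numeric features $x$ and $y$ is

$$ r_{xy} = \frac{\sum_{i=1}^{n} (x_i - \bar{x})(y_i - \bar{y})}{\sqrt{\sum_{i=1}^{n} (x_i - \bar{x})^2}\,\sqrt{\sum_{i=1}^{n} (y_i - \bar{y})^2}}, $$

so values near $\pm 1$ indicate that two features carry nearly duplicate information, while values near $0$ indicate little linear relationship.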
# plot correlation for numeric columns
numeric_columns = tess_data_clean.select_dtypes(include=[np.number]).columns
correlation_matrix = tess_data_clean[numeric_columns].corr()
print(f"Correlation matrix shape: {correlation_matrix.shape}")
Correlation matrix shape: (69, 69)
# Create correlation heatmap
plt.figure(figsize=(8, 6))
mask = np.triu(np.ones_like(correlation_matrix, dtype=bool))
sns.heatmap(correlation_matrix,
mask=mask,
annot=False,
cmap='RdBu_r',
center=0,
square=True,
cbar_kws={'shrink': 0.8})
plt.title('Feature Correlation Matrix', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.show()
Given the previous graph, dimensionality can be reduced further: first by removing highly correlated features, and second by computing the correlation of each remaining feature with the target variable tfopwg_disp, keeping only the top 20 or 30 features and dropping everything else.
This allows us to keep only the necessary information.
# Remove highly correlated features
def remove_highly_correlated_features(df, threshold=0.8):
    # Compute correlation matrix of the numeric columns
    corr_matrix = df.select_dtypes(include=[np.number]).corr()
    # Collect the highly correlated pairs
    high_corr_pairs = []
    for i in range(len(corr_matrix.columns)):
        for j in range(i + 1, len(corr_matrix.columns)):
            if abs(corr_matrix.iloc[i, j]) > threshold:
                high_corr_pairs.append((corr_matrix.columns[i], corr_matrix.columns[j]))
    # From each pair, drop the feature with more missing values
    features_to_remove = set()
    for feat1, feat2 in high_corr_pairs:
        if df[feat1].isnull().sum() <= df[feat2].isnull().sum():
            features_to_remove.add(feat2)
        else:
            features_to_remove.add(feat1)
    return df.drop(columns=list(features_to_remove)), list(features_to_remove)

tess_data_final, removed_features = remove_highly_correlated_features(tess_data_clean, threshold=0.8)
print(f"Initial features: {tess_data_clean.shape[1]}")
print(f"After removal correlated features: {tess_data_final.shape[1]}")
print(f"Removed {len(removed_features)} features:")
for feat in removed_features:
    print(f" - {feat}")
Initial features: 76 After removal correlated features: 60 Removed 16 features: - ctoi_alias - st_loggerr2 - toipfx - st_logg - st_raderr2 - st_tmagerr2 - pl_radeerr2 - pl_orbpererr2 - pl_tranmiderr2 - pl_trandurherr2 - st_pmraerr2 - st_pmdecerr1 - st_tefferr2 - st_pmdecerr2 - pl_trandeperr2 - st_disterr2
In the following step, the tfopwg_disp target is used to further reduce the DataFrame's dimensionality.
target = 'tfopwg_disp'
if target in tess_data_clean.columns:
    # Calculate correlation of the numeric features with the target disposition
    target_corr = tess_data_clean.select_dtypes(include=[np.number]).corrwith(
        pd.get_dummies(tess_data_clean[target]).iloc[:, 0]
    ).abs().sort_values(ascending=False)
    print("Top features correlated with target 'tfopwg_disp':")
    print(target_corr.head(10))
Top features correlated with target 'tfopwg_disp': toipfx 0.325497 toi 0.325497 st_tmag 0.282037 st_dist 0.229015 pl_rade 0.226503 st_logg 0.181789 st_teff 0.179020 pl_pnum 0.173937 pl_eqt 0.167106 st_rad 0.164805 dtype: float64
# Compute correlation of the numeric features with the target (tfopwg_disp)
target_corr = tess_data_final.select_dtypes(include=[np.number]).corrwith(
    pd.get_dummies(tess_data_final['tfopwg_disp']).iloc[:, 0]
).abs().sort_values(ascending=False)
# Select the top correlated features (the target itself is non-numeric, so it is not included)
top_20_features = target_corr.head(20).index.tolist()
top_30_features = target_corr.head(30).index.tolist()
X_20_features = tess_data_final[top_20_features]
X_30_features = tess_data_final[top_30_features]
print(f"X_20 features shape: {X_20_features.shape}")
print(f"X_30 features shape: {X_30_features.shape}")
print(f"y_binary shape: {y_binary.shape}")
print(f"Target: tfopwg_disp")
X_20 features shape: (5946, 20) X_30 features shape: (5946, 30) y_binary shape: (5946,) Target: tfopwg_disp
# Calculate correlation matrix for top 20 features
correlation_matrix = tess_data_clean[top_20_features].corr()
# Create the heatmap
plt.figure(figsize=(10, 10))
sns.heatmap(correlation_matrix,
annot=True,
cmap='coolwarm',
center=0,
square=True,
fmt='.2f',
cbar_kws={'shrink': 0.8})
# Rotate the labels
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.title('TESS Top 20 Features Correlation Matrix', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.savefig('tess_correlation_matrix.png', dpi=300, bbox_inches='tight')
plt.show()
# Differences with 30-feature model
additional_features = set(top_30_features) - set(top_20_features)
print(f"\nAdditional features in 30-feature model:")
for i, feature in enumerate(additional_features, 1):
    importance = target_corr[feature]
    print(f"  {i:2d}. {feature:25s} {importance:.3f}")
Additional features in 30-feature model: 1. tid 0.021 2. pl_trandurh 0.014 3. pl_orbpererr1 0.019 4. pl_tranmiderr1 0.012 5. pl_orbper 0.016 6. st_tmagsymerr nan 7. dec 0.011 8. st_loggerr1 0.020 9. st_pmra 0.005 10. pl_trandurherr1 0.013
# Let us plot a graph to understand
fig, ax = plt.subplots(figsize=(8, 6))
top_20_features_corr = target_corr.head(20)
# Create bars
bars = ax.barh(range(len(top_20_features_corr)), top_20_features_corr.values,
color='#1e40af', alpha=0.8, height=0.8)
# Styling
ax.set_yticks(range(len(top_20_features_corr)))
ax.set_yticklabels(top_20_features_corr.index, fontsize=10)
ax.set_xlabel('Correlation Coefficient', fontsize=12, fontweight='bold')
ax.set_title('TESS Feature Scores for Top 20 Features', fontsize=14, fontweight='bold')
# create a grid
ax.grid(axis='x', alpha=0.3, linestyle='-', linewidth=0.5)
# add statistics
n_features = len(top_20_features_corr)
max_corr = top_20_features_corr.max()
ax.text(0.02, 0.98, f'Top {n_features} Features\nMax Correlation: {max_corr:.3f}',
transform=ax.transAxes, fontsize=10,
verticalalignment='top', bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))
ax.invert_yaxis()
plt.tight_layout()
plt.show()
Apply Scikit-Learn Iterative Imputer¶
Despite the reduced dimensionality, the dataset still contains null entries. Dropping the columns containing null values can cause a loss of information, affecting the accuracy of the model. To address this problem, scikit-learn's iterative imputer is used, as per [1].
The iterative imputer models p(x|y), with x being the feature with missing values and y the features containing observed values.
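As a minimal sketch of this round-robin idea (hypothetical toy data, and the default BayesianRidge estimator rather than the random-forest estimator used below), each column's missing entries are repeatedly predicted from the other columns until the imputed values stabilise:

# Hedged sketch (not part of the pipeline): round-robin imputation on toy data.
# The DataFrame below is hypothetical; the default BayesianRidge estimator is used here.
toy = pd.DataFrame({
    'pl_orbper':   [1.2, 3.4, np.nan, 8.1, 2.2],
    'pl_trandep':  [500.0, np.nan, 900.0, 1200.0, 650.0],
    'pl_trandurh': [2.1, 3.0, 4.2, np.nan, 2.5],
})
# Each column's NaNs are predicted from the other columns, iterating until
# the change between rounds falls below the tolerance.
toy_imputed = IterativeImputer(max_iter=10, random_state=0).fit_transform(toy)
print(pd.DataFrame(toy_imputed, columns=toy.columns))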
# check if X_features datasets are pd
print(type(X_20_features))
print(type(X_30_features))
<class 'pandas.core.frame.DataFrame'> <class 'pandas.core.frame.DataFrame'>
# Use IterativeImputer with a RandomForestRegressor as the estimator
# Apply imputation to the feature datasets only
imputer_20 = IterativeImputer(
    estimator=RandomForestRegressor(n_estimators=10, random_state=42),
    max_iter=10,
    random_state=42,
    verbose=1
)
imputer_30 = IterativeImputer(
    estimator=RandomForestRegressor(n_estimators=10, random_state=42),
    max_iter=10,
    random_state=42,
    verbose=1
)
# Impute missing values
X_20_imputed = imputer_20.fit_transform(X_20_features)
X_30_imputed = imputer_30.fit_transform(X_30_features)
# Convert to DataFrames
X_20_imputed_df = pd.DataFrame(X_20_imputed, columns=X_20_features.columns, index=X_20_features.index)
X_30_imputed_df = pd.DataFrame(X_30_imputed, columns=X_30_features.columns, index=X_30_features.index)
print(f"20-feature DataFrame after imputation: {X_20_imputed_df.shape}")
print(f"30-feature DataFrame after imputation: {X_30_imputed_df.shape}")
[IterativeImputer] Completing matrix with shape (5946, 20) [IterativeImputer] Change: 31188.0208546781, scaled tolerance: 2460.8630758159998 [IterativeImputer] Change: 19263.948990589997, scaled tolerance: 2460.8630758159998 [IterativeImputer] Change: 2665.3738247499987, scaled tolerance: 2460.8630758159998 [IterativeImputer] Change: 2192.3660066599996, scaled tolerance: 2460.8630758159998 [IterativeImputer] Early stopping criterion reached. [IterativeImputer] Completing matrix with shape (5946, 30) [IterativeImputer] Change: 30673.454963276115, scaled tolerance: 2010186.093 [IterativeImputer] Early stopping criterion reached. 20-feature DataFrame after imputation: (5946, 20) 30-feature DataFrame after imputation: (5946, 30)
Model Training¶
For the model training, the dataset is split into training and test sets with an 80/20 ratio. StandardScaler is then fitted on the training split only and applied to both splits, to avoid any data leakage from the test set into the model.
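As a side note, the same leakage-free setup could be expressed with an sklearn Pipeline, which refits the scaler only on whatever data .fit() receives. The following is a hedged sketch of that alternative (reusing X_20_imputed_df and y_binary from above, with a single random forest instead of the full ensemble), not the approach taken in the cells below:

# Hedged alternative sketch: wrap scaling and a classifier in one Pipeline,
# so the scaler is only ever fitted on the training fold.
from sklearn.pipeline import Pipeline

X_tr, X_te, y_tr, y_te = train_test_split(
    X_20_imputed_df, y_binary, test_size=0.2, random_state=42
)
pipe = Pipeline([
    ('scaler', StandardScaler()),   # fitted on X_tr only, inside pipe.fit()
    ('clf', RandomForestClassifier(n_estimators=100, random_state=42)),
])
pipe.fit(X_tr, y_tr)
print(f"Pipeline hold-out accuracy: {pipe.score(X_te, y_te):.3f}")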
# Split data using the y_binary by 80/20 split
X_train_20, X_test_20, y_train_20, y_test_20 = train_test_split(
X_20_imputed_df, y_binary, test_size=0.2, random_state=42
)
X_train_30, X_test_30, y_train_30, y_test_30 = train_test_split(
X_30_imputed_df, y_binary, test_size=0.2, random_state=42
)
print(f"20-feature training set: {X_train_20.shape}")
print(f"20-feature test set: {X_test_20.shape}")
print(f"30-feature training set: {X_train_30.shape}")
print(f"30-feature test set: {X_test_30.shape}")
20-feature training set: (4756, 20) 20-feature test set: (1190, 20) 30-feature training set: (4756, 30) 30-feature test set: (1190, 30)
# Define a standard scaler
scaler_20 = StandardScaler()
scaler_30 = StandardScaler()
# Fit scalers on training data only
X_train_20_scaled = scaler_20.fit_transform(X_train_20)
X_test_20_scaled = scaler_20.transform(X_test_20)
X_train_30_scaled = scaler_30.fit_transform(X_train_30)
X_test_30_scaled = scaler_30.transform(X_test_30)
# Convert back to DataFrames
X_train_20_scaled_df = pd.DataFrame(X_train_20_scaled, columns=X_train_20.columns, index=X_train_20.index)
X_test_20_scaled_df = pd.DataFrame(X_test_20_scaled, columns=X_test_20.columns, index=X_test_20.index)
X_train_30_scaled_df = pd.DataFrame(X_train_30_scaled, columns=X_train_30.columns, index=X_train_30.index)
X_test_30_scaled_df = pd.DataFrame(X_test_30_scaled, columns=X_test_30.columns, index=X_test_30.index)
print(f"20-feature training set (scaled): {X_train_20_scaled_df.shape}")
print(f"20-feature test set (scaled): {X_test_20_scaled_df.shape}")
20-feature training set (scaled): (4756, 20) 20-feature test set (scaled): (1190, 20)
Now the ensembles for the two feature sets are created. Note that the maximum number of iterations for the neural network is set to 500: experiments showed that using 1000 iterations provides a negligible improvement in model accuracy while increasing the training time.
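With voting='soft', the ensemble's prediction for each sample is the class with the highest average of the members' predicted probabilities (unweighted here). A tiny hedged sketch of that rule, using two hypothetical members' predict_proba outputs:

# Hedged sketch of the soft-voting rule: average class probabilities, take the argmax.
proba_a = np.array([[0.7, 0.3],
                    [0.4, 0.6]])   # hypothetical predict_proba from one member
proba_b = np.array([[0.6, 0.4],
                    [0.2, 0.8]])   # hypothetical predict_proba from another member
avg_proba = (proba_a + proba_b) / 2    # unweighted soft vote
print(avg_proba.argmax(axis=1))        # predicted classes -> [0 1]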
ensemble_20 = VotingClassifier([
('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
('xgb', XGBClassifier(random_state=42)),
('svm', SVC(probability=True, random_state=42)),
('lr', LogisticRegression(random_state=42, max_iter=2000)),
('deep_nn', MLPClassifier(hidden_layer_sizes=(200, 100, 50, 25, 10), max_iter=500, random_state=42))
], voting='soft')
ensemble_30 = VotingClassifier([
('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
('xgb', XGBClassifier(random_state=42)),
('svm', SVC(probability=True, random_state=42)),
('lr', LogisticRegression(random_state=42, max_iter=2000)),
('deep_nn', MLPClassifier(hidden_layer_sizes=(200, 100, 50, 25, 10), max_iter=500, random_state=42))
], voting='soft')
# Train ensemble
ensemble_20.fit(X_train_20_scaled_df, y_train_20)
# Evaluation
y_pred_20 = ensemble_20.predict(X_test_20_scaled_df)
print(f"Ensemble_20 Accuracy: {accuracy_score(y_test_20, y_pred_20):.3f}")
print(classification_report(y_test_20, y_pred_20))
Ensemble_20 Accuracy: 0.861
precision recall f1-score support
0 0.87 0.95 0.91 909
1 0.79 0.56 0.65 281
accuracy 0.86 1190
macro avg 0.83 0.76 0.78 1190
weighted avg 0.85 0.86 0.85 1190
# AUC score for the 20-feature model
y_pred_proba_20 = ensemble_20.predict_proba(X_test_20_scaled)
auc_20 = roc_auc_score(y_test_20, y_pred_proba_20[:, 1])
print(f"20-Feature Model AUC Score: {auc_20:.3f}")
20-Feature Model AUC Score: 0.911
# Saving the 20-feature model
model_package_20 = {
    'model': ensemble_20,
    'imputer': imputer_20,
    'scaler': scaler_20,
    'features': top_20_features,
    'accuracy': 0.861,
    'auc_score': 0.911,
    'mission': 'tess',
    'classes': ['CANDIDATE', 'CONFIRMED'],
    'class_mapping': {'CANDIDATE': 0, 'CONFIRMED': 1},
    'n_features': 20,
    'performance': {
        'candidate_precision': 0.87,
        'candidate_recall': 0.95,
        'confirmed_precision': 0.79,
        'confirmed_recall': 0.56
    }
}
joblib.dump(model_package_20, 'tess_final_20_features_model.pkl')
['tess_final_20_features_model.pkl']
Train the 30-feature model¶
# Train ensemble
ensemble_30.fit(X_train_30_scaled_df, y_train_30)
# Evaluation
y_pred_30 = ensemble_30.predict(X_test_30_scaled_df)
print(f"Ensemble_30 Accuracy: {accuracy_score(y_test_30, y_pred_30):.3f}")
print(classification_report(y_test_30, y_pred_30))
Ensemble_30 Accuracy: 0.858
precision recall f1-score support
0 0.88 0.95 0.91 909
1 0.77 0.57 0.66 281
accuracy 0.86 1190
macro avg 0.82 0.76 0.78 1190
weighted avg 0.85 0.86 0.85 1190
# Get prediction probabilities
y_pred_proba_30 = ensemble_30.predict_proba(X_test_30_scaled)
# Calculate AUC score
auc_30 = roc_auc_score(y_test_30, y_pred_proba_30[:, 1])
print(f"30-Feature Model AUC Score: {auc_30:.3f}")
30-Feature Model AUC Score: 0.915
Note that using 30 features yields a slightly higher AUC (0.915 vs 0.911) but marginally lower accuracy (0.858 vs 0.861) than the 20-feature ensemble. The 30-feature model is now saved as a .pkl file as well.
# Save the 30-feature model
model_package_30 = {
    'model': ensemble_30,
    'imputer': imputer_30,
    'scaler': scaler_30,
    'features': top_30_features,
    'accuracy': 0.858,
    'auc_score': 0.915,
    'mission': 'tess',
    'classes': ['CANDIDATE', 'CONFIRMED'],
    'class_mapping': {'CANDIDATE': 0, 'CONFIRMED': 1},
    'n_features': 30,
    'performance': {
        'candidate_precision': 0.88,
        'candidate_recall': 0.95,
        'confirmed_precision': 0.77,
        'confirmed_recall': 0.57
    }
}
joblib.dump(model_package_30, 'tess_30_features.pkl')
print("Final TESS model saved with 30 features")
Final TESS model saved with 30 features
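As a usage note, either saved package can later be reloaded and applied to new TOI rows. The sketch below is hedged: new_toi is a hypothetical DataFrame containing the raw (unimputed, unscaled) columns listed in package['features'], and the transform order mirrors the training pipeline above (impute, then scale, then predict).

# Hedged sketch: reload a saved package and score new candidates.
package = joblib.load('tess_30_features.pkl')

def classify_toi(new_toi):
    """Return the probability that each row is a confirmed planet (class 1)."""
    X = new_toi[package['features']]          # select the expected raw columns
    X = package['imputer'].transform(X)       # fill missing values
    X = package['scaler'].transform(X)        # apply the training-set scaling
    return pd.Series(package['model'].predict_proba(X)[:, 1],
                     index=new_toi.index, name='p_confirmed')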
Results & Discussion¶
The Hybrid ML Ensemble Architecture does not provide improved performance with the top-30 feature set compared to the top-20 feature set: accuracy is essentially unchanged (0.858 vs 0.861), and only the AUC improves marginally (0.915 vs 0.911).
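The two ensembles compare as follows (figures taken from the classification reports and AUC scores above):

| Model | Accuracy | AUC | Confirmed precision | Confirmed recall |
|---|---|---|---|---|
| 20-feature ensemble | 0.861 | 0.911 | 0.79 | 0.56 |
| 30-feature ensemble | 0.858 | 0.915 | 0.77 | 0.57 |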
References¶
[1] Saha, R. (2021). Comparing Classification Models on Kepler Data. arXiv preprint arXiv:2101.01904 [astro-ph.EP]. Viewed 5/10/2025, https://arxiv.org/abs/2101.01904.
[2] Luz, T. S. F., Braga, R. A. S., & Ribeiro, E. R. (2024). Assessment of Ensemble-Based Machine Learning Algorithms for Exoplanet Identification. Electronics, 13(19), 3950. https://doi.org/10.3390/electronics13193950