{"id":6235,"date":"2026-04-15T21:44:40","date_gmt":"2026-04-16T02:44:40","guid":{"rendered":"https:\/\/ykim.synology.me\/wordpress\/?p=6235"},"modified":"2026-04-16T11:52:29","modified_gmt":"2026-04-16T16:52:29","slug":"addressing-random-seed-sensitivity-in-feature-selection-a-survey-of-methods-and-recent-advances-2025-2026","status":"publish","type":"post","link":"https:\/\/ykim.synology.me\/wordpress\/addressing-random-seed-sensitivity-in-feature-selection-a-survey-of-methods-and-recent-advances-2025-2026-6235\/","title":{"rendered":"Addressing Random Seed Sensitivity in Feature Selection: A Survey of Methods and Recent Advances (2025\u20132026)"},"content":{"rendered":"<p>\r\n    <style>\r\n    .k-page-nav { margin-bottom:20px; padding:10px 0; }\r\n    .k-page-nav a, .k-page-nav span {\r\n        display:block; padding:6px 10px; margin-bottom:6px;\r\n        background:#eee; border-radius:4px; text-decoration:none;\r\n        color:#333; font-weight:500;\r\n    }\r\n    .k-page-nav span { background:#333; color:#fff; }\r\n    <\/style>\r\n\r\n    <div class=\"k-page-nav\">\r\n                                    <span>Random Seed Sensitivity in Feature Selection \u2014 Page 1<\/span>\r\n                                                <a href=\"https:\/\/ykim.synology.me\/wordpress\/addressing-random-seed-sensitivity-in-feature-selection-a-survey-of-methods-and-recent-advances-2025-2026-6235\/2\/\" class=\"post-page-numbers\">                    Stability Selection with Python Code \u2014 Page 2                <\/a>\r\n                                                <a href=\"https:\/\/ykim.synology.me\/wordpress\/addressing-random-seed-sensitivity-in-feature-selection-a-survey-of-methods-and-recent-advances-2025-2026-6235\/3\/\" class=\"post-page-numbers\">                    Ensemble Importance Averaging with Python Code \u2014 Page 3                <\/a>\r\n                        <\/div>\r\n\r\n    <\/p>\n\n\n<style>.kadence-column6235_e0453a-34 > 
.kt-inside-inner-col{display:flex;}.kadence-column6235_e0453a-34 > .kt-inside-inner-col,.kadence-column6235_e0453a-34 > .kt-inside-inner-col:before{border-top-left-radius:0px;border-top-right-radius:0px;border-bottom-right-radius:0px;border-bottom-left-radius:0px;}.kadence-column6235_e0453a-34 > .kt-inside-inner-col{column-gap:var(--global-kb-gap-sm, 1rem);}.kadence-column6235_e0453a-34 > .kt-inside-inner-col{flex-direction:row;flex-wrap:wrap;align-items:flex-end;}.kadence-column6235_e0453a-34 > .kt-inside-inner-col > *, .kadence-column6235_e0453a-34 > .kt-inside-inner-col > figure.wp-block-image, .kadence-column6235_e0453a-34 > .kt-inside-inner-col > figure.wp-block-kadence-image{margin-top:0px;margin-bottom:0px;}.kadence-column6235_e0453a-34 > .kt-inside-inner-col > .kb-image-is-ratio-size{flex-grow:1;}.kt-row-column-wrap > .kadence-column6235_e0453a-34{align-self:flex-end;}.kt-inner-column-height-full:not(.kt-has-1-columns) > .wp-block-kadence-column.kadence-column6235_e0453a-34{align-self:auto;}.kt-inner-column-height-full:not(.kt-has-1-columns) > .wp-block-kadence-column.kadence-column6235_e0453a-34 > .kt-inside-inner-col{align-items:flex-end;}.kadence-column6235_e0453a-34 > .kt-inside-inner-col:before{opacity:0.3;}.kadence-column6235_e0453a-34{position:relative;}@media all and (max-width: 1024px){.kt-row-column-wrap > .kadence-column6235_e0453a-34{align-self:flex-end;}}@media all and (max-width: 1024px){.kt-inner-column-height-full:not(.kt-has-1-columns) > .wp-block-kadence-column.kadence-column6235_e0453a-34{align-self:auto;}}@media all and (max-width: 1024px){.kt-inner-column-height-full:not(.kt-has-1-columns) > .wp-block-kadence-column.kadence-column6235_e0453a-34 > .kt-inside-inner-col{align-items:flex-end;}}@media all and (max-width: 1024px){.kadence-column6235_e0453a-34 > .kt-inside-inner-col{flex-direction:row;flex-wrap:wrap;align-items:flex-end;}}@media all and (min-width: 768px) and (max-width: 1024px){.kadence-column6235_e0453a-34 > 
.kt-inside-inner-col > *, .kadence-column6235_e0453a-34 > .kt-inside-inner-col > figure.wp-block-image, .kadence-column6235_e0453a-34 > .kt-inside-inner-col > figure.wp-block-kadence-image{margin-top:0px;margin-bottom:0px;}.kadence-column6235_e0453a-34 > .kt-inside-inner-col > .kb-image-is-ratio-size{flex-grow:1;}}@media all and (max-width: 767px){.kt-row-column-wrap > .kadence-column6235_e0453a-34{align-self:flex-end;}.kt-inner-column-height-full:not(.kt-has-1-columns) > .wp-block-kadence-column.kadence-column6235_e0453a-34{align-self:auto;}.kt-inner-column-height-full:not(.kt-has-1-columns) > .wp-block-kadence-column.kadence-column6235_e0453a-34 > .kt-inside-inner-col{align-items:flex-end;}.kadence-column6235_e0453a-34 > .kt-inside-inner-col{flex-direction:row;flex-wrap:wrap;justify-content:flex-start;}.kadence-column6235_e0453a-34 > .kt-inside-inner-col > *, .kadence-column6235_e0453a-34 > .kt-inside-inner-col > figure.wp-block-image, .kadence-column6235_e0453a-34 > .kt-inside-inner-col > figure.wp-block-kadence-image{margin-top:0px;margin-bottom:0px;}.kadence-column6235_e0453a-34 > .kt-inside-inner-col > .kb-image-is-ratio-size{flex-grow:1;}}<\/style>\n<div class=\"wp-block-kadence-column kadence-column6235_e0453a-34 kb-section-dir-horizontal\"><div class=\"kt-inside-inner-col\">\n<figure class=\"wp-block-image size-full is-resized\"><img loading=\"lazy\" decoding=\"async\" width=\"1024\" height=\"1024\" src=\"https:\/\/ykim.synology.me\/wordpress\/wp-content\/uploads\/2026\/04\/warm-floating-dandelion-seeds.jpg\" alt=\"\" class=\"wp-image-6245\" style=\"width:600px\" srcset=\"https:\/\/ykim.synology.me\/wordpress\/wp-content\/uploads\/2026\/04\/warm-floating-dandelion-seeds.jpg 1024w, https:\/\/ykim.synology.me\/wordpress\/wp-content\/uploads\/2026\/04\/warm-floating-dandelion-seeds-300x300.jpg 300w, https:\/\/ykim.synology.me\/wordpress\/wp-content\/uploads\/2026\/04\/warm-floating-dandelion-seeds-150x150.jpg 150w, 
https:\/\/ykim.synology.me\/wordpress\/wp-content\/uploads\/2026\/04\/warm-floating-dandelion-seeds-768x768.jpg 768w\" sizes=\"auto, (max-width: 1024px) 100vw, 1024px\" \/><\/figure>\n\n\n\n<p class=\"wp-block-paragraph\" style=\"font-size:8px\"><mark style=\"background-color:rgba(0, 0, 0, 0)\" class=\"has-inline-color has-theme-palette-6-color\">Delicate dandelion seed heads<\/mark><\/p>\n<\/div><\/div>\n\n\n\n<h2 class=\"wp-block-heading\">Random Seed Sensitivity in Feature Selection<\/h2>\n\n\n\n<h3 class=\"wp-block-heading\">1. Problem Statement<\/h3>\n\n\n\n<p class=\"wp-block-paragraph\">Feature selection results can vary significantly depending on the random seed used during model training, data splitting, or stochastic optimization. This instability undermines reproducibility and trustworthiness, particularly in high-stakes domains such as healthcare and genomics where consistent feature identification is critical for scientific validity.<\/p>\n\n\n\n<h3 class=\"wp-block-heading\">2. 
Classical Approaches<\/h3>\n\n\n\n<figure style=\"padding-right:var(--wp--preset--spacing--40);padding-left:var(--wp--preset--spacing--40)\" class=\"wp-block-table\"><table><thead><tr><th>Method<\/th><th>Core Idea<\/th><th>Strength<\/th><\/tr><\/thead><tbody><tr><td><strong>Stability Selection<\/strong> (Meinshausen &amp; B\u00fchlmann, 2010)<\/td><td>Repeated subsampling + selection frequency thresholding<\/td><td>Theoretical bound on the expected number of false selections (per-family error rate)<\/td><\/tr><tr><td><strong>Ensemble Importance Averaging<\/strong><\/td><td>Aggregate feature importances across multiple seeds\/models<\/td><td>Simple, model-agnostic<\/td><\/tr><tr><td><strong>Permutation Importance<\/strong><\/td><td>Measure performance drop when a feature is shuffled<\/td><td>Model-agnostic; stable with repetition<\/td><\/tr><tr><td><strong>Boruta Algorithm<\/strong><\/td><td>Compare features against shuffled &#8220;shadow&#8221; copies via statistical test<\/td><td>Built-in significance testing<\/td><\/tr><tr><td><strong>SHAP-based Selection<\/strong><\/td><td>Rank features by Shapley values averaged over runs<\/td><td>Game-theoretic foundation; interpretable<\/td><\/tr><tr><td><strong>Knockoff Filter<\/strong> (Barber &amp; Cand\u00e8s, 2015)<\/td><td>Generate synthetic knockoff variables that mimic the correlation structure of the real features but are conditionally independent of the target; compare importances<\/td><td>Explicit FDR control<\/td><\/tr><\/tbody><\/table><\/figure>\n\n\n\n<p class=\"wp-block-paragraph\"><strong>Common principle:<\/strong> Repeat \u2192 Aggregate \u2192 Threshold. Running feature selection N times (typically 50\u2013100) with different seeds and retaining features selected above a frequency threshold (e.g., \u226580%) is the most practical stabilization strategy regardless of the base method.<\/p>\n\n\n\n<h3 class=\"wp-block-heading\">3. 
Recent Advances (2025\u20132026)<\/h3>\n\n\n<style>.kadence-column6235_078e20-d1 > .kt-inside-inner-col{padding-right:var(--global-kb-spacing-xl, 4rem);padding-left:var(--global-kb-spacing-sm, 1.5rem);}.kadence-column6235_078e20-d1 > .kt-inside-inner-col,.kadence-column6235_078e20-d1 > .kt-inside-inner-col:before{border-top-left-radius:0px;border-top-right-radius:0px;border-bottom-right-radius:0px;border-bottom-left-radius:0px;}.kadence-column6235_078e20-d1 > .kt-inside-inner-col{column-gap:var(--global-kb-gap-sm, 1rem);}.kadence-column6235_078e20-d1 > .kt-inside-inner-col{flex-direction:column;}.kadence-column6235_078e20-d1 > .kt-inside-inner-col > .aligncenter{width:100%;}.kadence-column6235_078e20-d1 > .kt-inside-inner-col:before{opacity:0.3;}.kadence-column6235_078e20-d1{position:relative;}@media all and (max-width: 1024px){.kadence-column6235_078e20-d1 > .kt-inside-inner-col{flex-direction:column;justify-content:center;}}@media all and (max-width: 767px){.kadence-column6235_078e20-d1 > .kt-inside-inner-col{flex-direction:column;justify-content:center;}}<\/style>\n<div class=\"wp-block-kadence-column kadence-column6235_078e20-d1\"><div class=\"kt-inside-inner-col\">\n<h4 class=\"wp-block-heading\">3.1 MVFS-SHAP: Contribution-Driven Voting (2026)<\/h4>\n\n\n\n<ul class=\"wp-block-list\">\n<li><strong>Method:<\/strong> Bootstrap + 5-fold CV \u2192 apply base selector on each resample \u2192 majority voting \u2192 re-rank by SHAP contribution scores (Ridge + Linear SHAP).<\/li>\n\n\n\n<li><strong>Key innovation:<\/strong> Moves beyond pure frequency-based aggregation to <em>contribution-driven aggregation<\/em>, jointly considering how often and how much a feature contributes.<\/li>\n\n\n\n<li><strong>Results:<\/strong> Stability &gt; 0.90 on two metabolomics datasets; ~80% of experiments above 0.80.<\/li>\n\n\n\n<li><strong>Ref:<\/strong> <a href=\"https:\/\/www.sciencedirect.com\/science\/article\/abs\/pii\/S0169260725005863\" target=\"_blank\" 
rel=\"noopener\">ScienceDirect<\/a><\/li>\n<\/ul>\n\n\n\n<h4 class=\"wp-block-heading\">3.2 Loss-Guided Stability Selection (2025)<\/h4>\n\n\n\n<ul class=\"wp-block-list\">\n<li><strong>Problem solved:<\/strong> Original stability selection tends to severely underfit on noisy high-dimensional data.<\/li>\n\n\n\n<li><strong>Method:<\/strong> Select the stability threshold in a <em>data-driven<\/em> manner by optimizing out-of-sample validation loss, optionally with exhaustive search.<\/li>\n\n\n\n<li><strong>Results:<\/strong> Significant precision improvement over raw Boosting while avoiding underfitting.<\/li>\n\n\n\n<li><strong>Ref:<\/strong> <a href=\"https:\/\/link.springer.com\/article\/10.1007\/s11634-023-00573-3\" target=\"_blank\" rel=\"noopener\">Springer ADAC<\/a><\/li>\n<\/ul>\n\n\n\n<h4 class=\"wp-block-heading\">3.3 GRIP2: Robust Deep Knockoff (2026)<\/h4>\n\n\n\n<ul class=\"wp-block-list\">\n<li><strong>Problem solved:<\/strong> Existing deep knockoff methods degrade under high feature correlation and low SNR.<\/li>\n\n\n\n<li><strong>Method:<\/strong> Improved deep generative knockoff framework with enhanced robustness guarantees.<\/li>\n\n\n\n<li><strong>Results:<\/strong> Superior FDR-controlled power vs. 
linear baselines; validated on HIV drug resistance mutations.<\/li>\n\n\n\n<li><strong>Ref:<\/strong> <a href=\"https:\/\/arxiv.org\/abs\/2602.00218\" target=\"_blank\" rel=\"noopener\">arXiv 2602.00218<\/a><\/li>\n<\/ul>\n\n\n\n<h4 class=\"wp-block-heading\">3.4 Knockoff-ML: Clinical Risk Feature Selection (2025)<\/h4>\n\n\n\n<ul class=\"wp-block-list\">\n<li><strong>Method:<\/strong> Model-free integration of knockoff framework with ML algorithms + SHAP for interpretability.<\/li>\n\n\n\n<li><strong>Strength:<\/strong> Handles nonlinear feature\u2013outcome relationships with FDR control; applied to EHR data.<\/li>\n\n\n\n<li><strong>Ref:<\/strong> <a href=\"https:\/\/www.nature.com\/articles\/s41746-025-02102-2\" target=\"_blank\" rel=\"noopener\">Nature npj Digital Medicine<\/a><\/li>\n<\/ul>\n\n\n\n<h4 class=\"wp-block-heading\">3.5 OSSFS: Stable Streaming Feature Selection (2025)<\/h4>\n\n\n\n<ul class=\"wp-block-list\">\n<li><strong>Problem solved:<\/strong> Streaming features arrive continuously; existing methods ignore stability.<\/li>\n\n\n\n<li><strong>Method:<\/strong> MeanShift-inspired incremental aggregation into hyperellipsoids; select representative features per cluster.<\/li>\n\n\n\n<li><strong>Results:<\/strong> Optimal stability without sacrificing predictive accuracy on real-world datasets.<\/li>\n\n\n\n<li><strong>Ref:<\/strong> <a href=\"https:\/\/dl.acm.org\/doi\/10.1145\/3715918\" target=\"_blank\" rel=\"noopener\">ACM TKDD<\/a><\/li>\n<\/ul>\n\n\n\n<h4 class=\"wp-block-heading\">3.6 Two-Stage RF + Improved GA (2025)<\/h4>\n\n\n\n<ul class=\"wp-block-list\">\n<li><strong>Method:<\/strong> Stage 1 \u2014 RF importance ranking; Stage 2 \u2014 improved genetic algorithm for global optimal subset search.<\/li>\n\n\n\n<li><strong>Strength:<\/strong> RF ensemble nature reduces seed dependency; GA explores search space systematically.<\/li>\n\n\n\n<li><strong>Ref:<\/strong> <a href=\"https:\/\/www.nature.com\/articles\/s41598-025-01761-1\" 
target=\"_blank\" rel=\"noopener\">Nature Scientific Reports<\/a><\/li>\n<\/ul>\n<\/div><\/div>\n\n\n\n<h3 class=\"wp-block-heading\">4. Summary of 2025\u20132026 Trends<\/h3>\n\n\n\n<p class=\"wp-block-paragraph\">Two dominant research directions have emerged:<\/p>\n\n\n\n<ol class=\"wp-block-list\">\n<li><strong>Interpretability-integrated aggregation<\/strong> \u2014 Combining repeated sampling with explainability metrics (especially SHAP) during the aggregation step, moving from &#8220;how often selected&#8221; to &#8220;how often selected AND how much it contributes&#8221; (e.g., MVFS-SHAP).<\/li>\n\n\n\n<li><strong>Deep knockoff extensions<\/strong> \u2014 Scaling the knockoff framework with deep generative models to handle nonlinear relationships, high correlation, and low-SNR regimes while maintaining rigorous FDR control (e.g., GRIP2, Knockoff-ML).<\/li>\n<\/ol>\n\n\n\n<h3 class=\"wp-block-heading\">5. Practical Recommendations<\/h3>\n\n\n\n<ol class=\"wp-block-list\">\n<li><mark style=\"background-color:rgba(0, 0, 0, 0);color:#1e90ff\" class=\"has-inline-color\"><strong>Always aggregate:<\/strong> Run feature selection with K different seeds (K \u2265 50) and retain features exceeding a stability threshold.<\/mark><\/li>\n\n\n\n<li><strong>Use SHAP-weighted voting<\/strong> over simple frequency counting for richer importance signals.<\/li>\n\n\n\n<li><strong>Apply knockoff methods<\/strong> when FDR control is a hard requirement (e.g., biomedical studies).<\/li>\n\n\n\n<li><strong>Increase ensemble size:<\/strong> For tree-based models, larger <code>n_estimators<\/code> naturally reduces seed sensitivity.<\/li>\n\n\n\n<li><strong>Report stability metrics:<\/strong> Use Jaccard index, Kuncheva index, or selection probability distributions alongside accuracy to quantify reproducibility.<\/li>\n<\/ol>\n\n\n\n<h2 class=\"wp-block-heading\">\ud83c\udfc4 FDR (False Discovery Rate)<\/h2>\n\n\n<style>.kadence-column6235_db09eb-80 > 
.kt-inside-inner-col{padding-right:var(--global-kb-spacing-xl, 4rem);padding-left:var(--global-kb-spacing-sm, 1.5rem);}.kadence-column6235_db09eb-80 > .kt-inside-inner-col,.kadence-column6235_db09eb-80 > .kt-inside-inner-col:before{border-top-left-radius:0px;border-top-right-radius:0px;border-bottom-right-radius:0px;border-bottom-left-radius:0px;}.kadence-column6235_db09eb-80 > .kt-inside-inner-col{column-gap:var(--global-kb-gap-sm, 1rem);}.kadence-column6235_db09eb-80 > .kt-inside-inner-col{flex-direction:column;}.kadence-column6235_db09eb-80 > .kt-inside-inner-col > .aligncenter{width:100%;}.kadence-column6235_db09eb-80 > .kt-inside-inner-col:before{opacity:0.3;}.kadence-column6235_db09eb-80{position:relative;}@media all and (max-width: 1024px){.kadence-column6235_db09eb-80 > .kt-inside-inner-col{flex-direction:column;justify-content:center;}}@media all and (max-width: 767px){.kadence-column6235_db09eb-80 > .kt-inside-inner-col{flex-direction:column;justify-content:center;}}<\/style>\n<div class=\"wp-block-kadence-column kadence-column6235_db09eb-80\"><div class=\"kt-inside-inner-col\">\n<p class=\"wp-block-paragraph\">FDR stands for <strong>False Discovery Rate<\/strong>, an error rate metric used in multiple hypothesis testing.<\/p>\n\n\n\n<h3 class=\"wp-block-heading\">Definition<\/h3>\n\n\n\n<p class=\"wp-block-paragraph\">FDR is the <strong>expected proportion<\/strong> of false positives among all items declared as &#8220;significant&#8221; (discoveries).<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">$$<br>FDR = E\\left[\\frac{V}{R}\\right]<br>$$<\/p>\n\n\n\n<ul class=\"wp-block-list\">\n<li><strong>V<\/strong>: Number of incorrectly selected features (false positives)<\/li>\n\n\n\n<li><strong>R<\/strong>: Total number of selected features (total discoveries)<\/li>\n\n\n\n<li>Defined as 0 when R = 0<\/li>\n<\/ul>\n\n\n\n<h3 class=\"wp-block-heading\">Meaning in Feature Selection<\/h3>\n\n\n\n<p class=\"wp-block-paragraph\">For example, controlling the FDR at 
<strong>0.1<\/strong> guarantees that, on average, <strong>no more than 10% of the finally selected features are irrelevant (false discoveries)<\/strong>.<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">If 100 features are selected:<\/p>\n\n\n\n<ul class=\"wp-block-list\">\n<li>On average, at most about <strong>10 features<\/strong> may actually be noise unrelated to the target<\/li>\n\n\n\n<li>The remaining <strong>~90 features<\/strong> are expected to be genuinely relevant; FDR control bounds the error rate on average over repeated experiments and does not guarantee that any individual feature, or any single run, is free of false discoveries<\/li>\n<\/ul>\n<\/div><\/div>\n\n\n\n<!--nextpage-->\n\n\n\n<h1 class=\"wp-block-heading\">Stability Selection with Python Code<\/h1>\n\n\n\n<h2 class=\"wp-block-heading\">Overview<\/h2>\n\n\n\n<p class=\"wp-block-paragraph\"><strong>Stability Selection<\/strong> (Meinshausen &amp; B\u00fchlmann, 2010) is a general technique that improves the reliability of feature selection by combining <strong>subsampling<\/strong> with any base selection algorithm (e.g., Lasso, Random Forest).<\/p>\n\n\n\n<h2 class=\"wp-block-heading\">Core Idea<\/h2>\n\n\n\n<ol class=\"wp-block-list\">\n<li>Repeatedly draw random subsamples (typically half of the data) \u2014 B times (e.g., B = 100).<\/li>\n\n\n\n<li>Apply a base feature selector on each subsample.<\/li>\n\n\n\n<li>For each feature, compute its <strong>selection probability<\/strong> = (# times selected) \/ B.<\/li>\n\n\n\n<li>Retain only features whose selection probability exceeds a threshold \u03c0 (e.g., 0.6\u20130.9).<\/li>\n<\/ol>\n\n\n\n<p class=\"wp-block-paragraph\">This filters out features that appear significant only due to random fluctuations, yielding a stable and reproducible feature set with theoretical false discovery control.<\/p>\n\n\n\n<h2 class=\"wp-block-heading\">Python Example (Lasso-based)<\/h2>\n\n\n\n<div class=\"wp-block-kevinbatdorf-code-block-pro cbp-has-line-numbers\" data-code-block-pro-font-family=\"Code-Pro-JetBrains-Mono\" 
style=\"font-size:1rem;font-family:Code-Pro-JetBrains-Mono,ui-monospace,SFMono-Regular,Menlo,Monaco,Consolas,monospace;--cbp-line-number-color:#24292e;--cbp-line-number-width:calc(2 * 0.6 * 1rem);line-height:1.625rem;--cbp-tab-width:2;tab-size:var(--cbp-tab-width, 2)\"><span role=\"button\" tabindex=\"0\" style=\"color:#24292e;display:none\" aria-label=\"Copy\" class=\"code-block-pro-copy-button\"><pre class=\"code-block-pro-copy-button-pre\" aria-hidden=\"true\"><textarea class=\"code-block-pro-copy-button-textarea\" tabindex=\"-1\" aria-hidden=\"true\" readonly>import numpy as np\nfrom sklearn.datasets import make_regression\nfrom sklearn.linear_model import Lasso\nfrom sklearn.preprocessing import StandardScaler\n\n# 1. Generate synthetic data: 200 samples, 50 features, only 10 informative\nX, y, true_coef = make_regression(\n    n_samples=200, n_features=50, n_informative=10,\n    noise=5.0, coef=True, random_state=0\n)\nX = StandardScaler().fit_transform(X)\n\n# 2. Stability Selection parameters\nn_bootstrap = 100          # number of subsamples (B)\nsubsample_ratio = 0.5      # fraction of data per subsample\nalpha = 0.05               # Lasso regularization strength\nthreshold = 0.7            # selection probability cutoff (\u03c0)\n\nn_samples, n_features = X.shape\nselection_counts = np.zeros(n_features)\n\n# 3. Repeated subsampling + Lasso selection\nrng = np.random.default_rng(42)\nsub_size = int(n_samples * subsample_ratio)\n\nfor _ in range(n_bootstrap):\n    idx = rng.choice(n_samples, size=sub_size, replace=False)\n    lasso = Lasso(alpha=alpha, max_iter=10000).fit(X&#091;idx&#093;, y&#091;idx&#093;)\n    selection_counts += (lasso.coef_ != 0).astype(int)\n\n# 4. Compute selection probabilities and apply threshold\nselection_prob = selection_counts \/ n_bootstrap\nstable_features = np.where(selection_prob >= threshold)&#091;0&#093;\n\n# 5. 
Report\nprint(f\"Stable features (prob \u2265 {threshold}): {stable_features}\")\nprint(f\"Selection probabilities: {np.round(selection_prob, 2)}\")\nprint(f\"True informative features: {np.where(true_coef != 0)&#091;0&#093;}\")<\/textarea><\/pre><svg xmlns=\"http:\/\/www.w3.org\/2000\/svg\" style=\"width:24px;height:24px\" fill=\"none\" viewBox=\"0 0 24 24\" stroke=\"currentColor\" stroke-width=\"2\"><path class=\"with-check\" stroke-linecap=\"round\" stroke-linejoin=\"round\" d=\"M9 5H7a2 2 0 00-2 2v12a2 2 0 002 2h10a2 2 0 002-2V7a2 2 0 00-2-2h-2M9 5a2 2 0 002 2h2a2 2 0 002-2M9 5a2 2 0 012-2h2a2 2 0 012 2m-6 9l2 2 4-4\"><\/path><path class=\"without-check\" stroke-linecap=\"round\" stroke-linejoin=\"round\" d=\"M9 5H7a2 2 0 00-2 2v12a2 2 0 002 2h10a2 2 0 002-2V7a2 2 0 00-2-2h-2M9 5a2 2 0 002 2h2a2 2 0 002-2M9 5a2 2 0 012-2h2a2 2 0 012 2\"><\/path><\/svg><\/span><pre class=\"shiki github-light\" style=\"background-color: #fff\" tabindex=\"0\"><code><span class=\"line\"><span style=\"color: #D73A49\">import<\/span><span style=\"color: #24292E\"> numpy <\/span><span style=\"color: #D73A49\">as<\/span><span style=\"color: #24292E\"> np<\/span><\/span>\n<span class=\"line\"><span style=\"color: #D73A49\">from<\/span><span style=\"color: #24292E\"> sklearn.datasets <\/span><span style=\"color: #D73A49\">import<\/span><span style=\"color: #24292E\"> make_regression<\/span><\/span>\n<span class=\"line\"><span style=\"color: #D73A49\">from<\/span><span style=\"color: #24292E\"> sklearn.linear_model <\/span><span style=\"color: #D73A49\">import<\/span><span style=\"color: #24292E\"> Lasso<\/span><\/span>\n<span class=\"line\"><span style=\"color: #D73A49\">from<\/span><span style=\"color: #24292E\"> sklearn.preprocessing <\/span><span style=\"color: #D73A49\">import<\/span><span style=\"color: #24292E\"> StandardScaler<\/span><\/span>\n<span class=\"line\"><\/span>\n<span class=\"line\"><span style=\"color: #6A737D\"># 1. 
Generate synthetic data: 200 samples, 50 features, only 10 informative<\/span><\/span>\n<span class=\"line\"><span style=\"color: #24292E\">X, y, true_coef <\/span><span style=\"color: #D73A49\">=<\/span><span style=\"color: #24292E\"> make_regression(<\/span><\/span>\n<span class=\"line\"><span style=\"color: #24292E\">    <\/span><span style=\"color: #E36209\">n_samples<\/span><span style=\"color: #D73A49\">=<\/span><span style=\"color: #005CC5\">200<\/span><span style=\"color: #24292E\">, <\/span><span style=\"color: #E36209\">n_features<\/span><span style=\"color: #D73A49\">=<\/span><span style=\"color: #005CC5\">50<\/span><span style=\"color: #24292E\">, <\/span><span style=\"color: #E36209\">n_informative<\/span><span style=\"color: #D73A49\">=<\/span><span style=\"color: #005CC5\">10<\/span><span style=\"color: #24292E\">,<\/span><\/span>\n<span class=\"line\"><span style=\"color: #24292E\">    <\/span><span style=\"color: #E36209\">noise<\/span><span style=\"color: #D73A49\">=<\/span><span style=\"color: #005CC5\">5.0<\/span><span style=\"color: #24292E\">, <\/span><span style=\"color: #E36209\">coef<\/span><span style=\"color: #D73A49\">=<\/span><span style=\"color: #005CC5\">True<\/span><span style=\"color: #24292E\">, <\/span><span style=\"color: #E36209\">random_state<\/span><span style=\"color: #D73A49\">=<\/span><span style=\"color: #005CC5\">0<\/span><\/span>\n<span class=\"line\"><span style=\"color: #24292E\">)<\/span><\/span>\n<span class=\"line\"><span style=\"color: #24292E\">X <\/span><span style=\"color: #D73A49\">=<\/span><span style=\"color: #24292E\"> StandardScaler().fit_transform(X)<\/span><\/span>\n<span class=\"line\"><\/span>\n<span class=\"line\"><span style=\"color: #6A737D\"># 2. 
Stability Selection parameters<\/span><\/span>\n<span class=\"line\"><span style=\"color: #24292E\">n_bootstrap <\/span><span style=\"color: #D73A49\">=<\/span><span style=\"color: #24292E\"> <\/span><span style=\"color: #005CC5\">100<\/span><span style=\"color: #24292E\">          <\/span><span style=\"color: #6A737D\"># number of subsamples (B)<\/span><\/span>\n<span class=\"line\"><span style=\"color: #24292E\">subsample_ratio <\/span><span style=\"color: #D73A49\">=<\/span><span style=\"color: #24292E\"> <\/span><span style=\"color: #005CC5\">0.5<\/span><span style=\"color: #24292E\">      <\/span><span style=\"color: #6A737D\"># fraction of data per subsample<\/span><\/span>\n<span class=\"line\"><span style=\"color: #24292E\">alpha <\/span><span style=\"color: #D73A49\">=<\/span><span style=\"color: #24292E\"> <\/span><span style=\"color: #005CC5\">0.05<\/span><span style=\"color: #24292E\">               <\/span><span style=\"color: #6A737D\"># Lasso regularization strength<\/span><\/span>\n<span class=\"line\"><span style=\"color: #24292E\">threshold <\/span><span style=\"color: #D73A49\">=<\/span><span style=\"color: #24292E\"> <\/span><span style=\"color: #005CC5\">0.7<\/span><span style=\"color: #24292E\">            <\/span><span style=\"color: #6A737D\"># selection probability cutoff (\u03c0)<\/span><\/span>\n<span class=\"line\"><\/span>\n<span class=\"line\"><span style=\"color: #24292E\">n_samples, n_features <\/span><span style=\"color: #D73A49\">=<\/span><span style=\"color: #24292E\"> X.shape<\/span><\/span>\n<span class=\"line\"><span style=\"color: #24292E\">selection_counts <\/span><span style=\"color: #D73A49\">=<\/span><span style=\"color: #24292E\"> np.zeros(n_features)<\/span><\/span>\n<span class=\"line\"><\/span>\n<span class=\"line\"><span style=\"color: #6A737D\"># 3. 
Repeated subsampling + Lasso selection<\/span><\/span>\n<span class=\"line\"><span style=\"color: #24292E\">rng <\/span><span style=\"color: #D73A49\">=<\/span><span style=\"color: #24292E\"> np.random.default_rng(<\/span><span style=\"color: #005CC5\">42<\/span><span style=\"color: #24292E\">)<\/span><\/span>\n<span class=\"line\"><span style=\"color: #24292E\">sub_size <\/span><span style=\"color: #D73A49\">=<\/span><span style=\"color: #24292E\"> <\/span><span style=\"color: #005CC5\">int<\/span><span style=\"color: #24292E\">(n_samples <\/span><span style=\"color: #D73A49\">*<\/span><span style=\"color: #24292E\"> subsample_ratio)<\/span><\/span>\n<span class=\"line\"><\/span>\n<span class=\"line\"><span style=\"color: #D73A49\">for<\/span><span style=\"color: #24292E\"> _ <\/span><span style=\"color: #D73A49\">in<\/span><span style=\"color: #24292E\"> <\/span><span style=\"color: #005CC5\">range<\/span><span style=\"color: #24292E\">(n_bootstrap):<\/span><\/span>\n<span class=\"line\"><span style=\"color: #24292E\">    idx <\/span><span style=\"color: #D73A49\">=<\/span><span style=\"color: #24292E\"> rng.choice(n_samples, <\/span><span style=\"color: #E36209\">size<\/span><span style=\"color: #D73A49\">=<\/span><span style=\"color: #24292E\">sub_size, <\/span><span style=\"color: #E36209\">replace<\/span><span style=\"color: #D73A49\">=<\/span><span style=\"color: #005CC5\">False<\/span><span style=\"color: #24292E\">)<\/span><\/span>\n<span class=\"line\"><span style=\"color: #24292E\">    lasso <\/span><span style=\"color: #D73A49\">=<\/span><span style=\"color: #24292E\"> Lasso(<\/span><span style=\"color: #E36209\">alpha<\/span><span style=\"color: #D73A49\">=<\/span><span style=\"color: #24292E\">alpha, <\/span><span style=\"color: #E36209\">max_iter<\/span><span style=\"color: #D73A49\">=<\/span><span style=\"color: #005CC5\">10000<\/span><span style=\"color: #24292E\">).fit(X&#091;idx&#093;, y&#091;idx&#093;)<\/span><\/span>\n<span class=\"line\"><span 
style=\"color: #24292E\">    selection_counts <\/span><span style=\"color: #D73A49\">+=<\/span><span style=\"color: #24292E\"> (lasso.coef_ <\/span><span style=\"color: #D73A49\">!=<\/span><span style=\"color: #24292E\"> <\/span><span style=\"color: #005CC5\">0<\/span><span style=\"color: #24292E\">).astype(<\/span><span style=\"color: #005CC5\">int<\/span><span style=\"color: #24292E\">)<\/span><\/span>\n<span class=\"line\"><\/span>\n<span class=\"line\"><span style=\"color: #6A737D\"># 4. Compute selection probabilities and apply threshold<\/span><\/span>\n<span class=\"line\"><span style=\"color: #24292E\">selection_prob <\/span><span style=\"color: #D73A49\">=<\/span><span style=\"color: #24292E\"> selection_counts <\/span><span style=\"color: #D73A49\">\/<\/span><span style=\"color: #24292E\"> n_bootstrap<\/span><\/span>\n<span class=\"line\"><span style=\"color: #24292E\">stable_features <\/span><span style=\"color: #D73A49\">=<\/span><span style=\"color: #24292E\"> np.where(selection_prob <\/span><span style=\"color: #D73A49\">&gt;=<\/span><span style=\"color: #24292E\"> threshold)&#091;<\/span><span style=\"color: #005CC5\">0<\/span><span style=\"color: #24292E\">&#093;<\/span><\/span>\n<span class=\"line\"><\/span>\n<span class=\"line\"><span style=\"color: #6A737D\"># 5. 
Report<\/span><\/span>\n<span class=\"line\"><span style=\"color: #005CC5\">print<\/span><span style=\"color: #24292E\">(<\/span><span style=\"color: #D73A49\">f<\/span><span style=\"color: #032F62\">&quot;Stable features (prob \u2265 <\/span><span style=\"color: #005CC5\">{<\/span><span style=\"color: #24292E\">threshold<\/span><span style=\"color: #005CC5\">}<\/span><span style=\"color: #032F62\">): <\/span><span style=\"color: #005CC5\">{<\/span><span style=\"color: #24292E\">stable_features<\/span><span style=\"color: #005CC5\">}<\/span><span style=\"color: #032F62\">&quot;<\/span><span style=\"color: #24292E\">)<\/span><\/span>\n<span class=\"line\"><span style=\"color: #005CC5\">print<\/span><span style=\"color: #24292E\">(<\/span><span style=\"color: #D73A49\">f<\/span><span style=\"color: #032F62\">&quot;Selection probabilities: <\/span><span style=\"color: #005CC5\">{<\/span><span style=\"color: #24292E\">np.round(selection_prob, <\/span><span style=\"color: #005CC5\">2<\/span><span style=\"color: #24292E\">)<\/span><span style=\"color: #005CC5\">}<\/span><span style=\"color: #032F62\">&quot;<\/span><span style=\"color: #24292E\">)<\/span><\/span>\n<span class=\"line\"><span style=\"color: #005CC5\">print<\/span><span style=\"color: #24292E\">(<\/span><span style=\"color: #D73A49\">f<\/span><span style=\"color: #032F62\">&quot;True informative features: <\/span><span style=\"color: #005CC5\">{<\/span><span style=\"color: #24292E\">np.where(true_coef <\/span><span style=\"color: #D73A49\">!=<\/span><span style=\"color: #24292E\"> <\/span><span style=\"color: #005CC5\">0<\/span><span style=\"color: #24292E\">)&#091;<\/span><span style=\"color: #005CC5\">0<\/span><span style=\"color: #24292E\">&#093;<\/span><span style=\"color: #005CC5\">}<\/span><span style=\"color: #032F62\">&quot;<\/span><span style=\"color: #24292E\">)<\/span><\/span><\/code><\/pre><\/div>\n\n\n\n<h2 class=\"wp-block-heading\">Key Parameters<\/h2>\n\n\n\n<figure 
style=\"padding-right:var(--wp--preset--spacing--40);padding-left:var(--wp--preset--spacing--40)\" class=\"wp-block-table\"><table><thead><tr><th>Parameter<\/th><th>Typical Value<\/th><th>Role<\/th><\/tr><\/thead><tbody><tr><td><code>n_bootstrap<\/code> (B)<\/td><td>100<\/td><td>More iterations \u2192 smoother probability estimates<\/td><\/tr><tr><td><code>subsample_ratio<\/code><\/td><td>0.5<\/td><td>Half-sampling is standard<\/td><\/tr><tr><td><code>alpha<\/code><\/td><td>Cross-validated<\/td><td>Regularization strength of base selector<\/td><\/tr><tr><td><code>threshold<\/code> (\u03c0)<\/td><td>0.6 \u2013 0.9<\/td><td>Higher threshold \u2192 fewer but more reliable features<\/td><\/tr><\/tbody><\/table><\/figure>\n\n\n\n<h2 class=\"wp-block-heading\">Practical Notes<\/h2>\n\n\n\n<ul class=\"wp-block-list\">\n<li>Swap <code>Lasso<\/code> with any selector (Random Forest importance, Boruta, mutual information, etc.).<\/li>\n\n\n\n<li>For stricter false discovery control, use the error bound from Meinshausen &amp; B\u00fchlmann (2010): for a threshold \u03c0 &gt; 0.5 and under the paper&#8217;s assumptions, the expected number of false selections is at most q\u00b2 \/ ((2\u03c0 \u2212 1) \u00b7 p), where q is the average number of features selected per subsample over the chosen regularization range and p is the total number of features.<\/li>\n\n\n\n<li>The <a href=\"https:\/\/github.com\/scikit-learn-contrib\/stability-selection\" target=\"_blank\" rel=\"noopener\"><code>stability-selection<\/code><\/a> package provides a scikit-learn compatible implementation.<\/li>\n<\/ul>\n\n\n\n<!--nextpage-->\n\n\n\n<h1 class=\"wp-block-heading\">Ensemble Importance Averaging with Python Code<\/h1>\n\n\n\n<h2 class=\"wp-block-heading\">Overview<\/h2>\n\n\n\n<p class=\"wp-block-paragraph\"><strong>Ensemble Importance Averaging<\/strong> is a simple yet effective technique for stabilizing feature selection. 
Instead of relying on a single model&#8217;s feature importances (which vary with the random seed), it trains multiple models with different seeds and <strong>averages the importance scores<\/strong> across them.<\/p>\n\n\n\n<h2 class=\"wp-block-heading\">Core Idea<\/h2>\n\n\n\n<ol class=\"wp-block-list\">\n<li>Train the same model N times, each with a different random seed.<\/li>\n\n\n\n<li>Extract feature importance scores from each model.<\/li>\n\n\n\n<li><strong>Average<\/strong> (or take the median of) the scores across all runs.<\/li>\n\n\n\n<li>Rank features by the averaged score and select the top-K (or apply a threshold).<\/li>\n<\/ol>\n\n\n\n<p class=\"wp-block-paragraph\">This reduces the variance in importance estimates caused by stochastic training elements (bootstrap sampling, feature subsampling, weight initialization, etc.) and yields a more reproducible ranking.<\/p>\n\n\n\n<h2 class=\"wp-block-heading\">Python Example (Random Forest-based)<\/h2>\n\n\n\n<div class=\"wp-block-kevinbatdorf-code-block-pro cbp-has-line-numbers\" data-code-block-pro-font-family=\"Code-Pro-JetBrains-Mono\" style=\"font-size:1rem;font-family:Code-Pro-JetBrains-Mono,ui-monospace,SFMono-Regular,Menlo,Monaco,Consolas,monospace;--cbp-line-number-color:#24292e;--cbp-line-number-width:calc(2 * 0.6 * 1rem);line-height:1.625rem;--cbp-tab-width:2;tab-size:var(--cbp-tab-width, 2)\"><span role=\"button\" tabindex=\"0\" style=\"color:#24292e;display:none\" aria-label=\"Copy\" class=\"code-block-pro-copy-button\"><pre class=\"code-block-pro-copy-button-pre\" aria-hidden=\"true\"><textarea class=\"code-block-pro-copy-button-textarea\" tabindex=\"-1\" aria-hidden=\"true\" readonly>import numpy as np\nimport pandas as pd\nfrom sklearn.datasets import make_classification\nfrom sklearn.ensemble import RandomForestClassifier\n\n# 1. 
Generate synthetic data: 500 samples, 30 features, only 8 informative\nX, y = make_classification(\n    n_samples=500, n_features=30, n_informative=8,\n    n_redundant=2, n_repeated=0, random_state=0\n)\n\n# 2. Ensemble averaging parameters\nn_runs = 50                # number of models with different seeds\ntop_k = 10                 # number of top features to retain\n\nn_features = X.shape&#091;1&#093;\nimportance_matrix = np.zeros((n_runs, n_features))\n\n# 3. Train multiple models with different seeds\nfor i in range(n_runs):\n    rf = RandomForestClassifier(\n        n_estimators=200,\n        random_state=i,       # vary seed across runs\n        n_jobs=-1\n    ).fit(X, y)\n    importance_matrix&#091;i&#093; = rf.feature_importances_\n\n# 4. Aggregate importances\nmean_importance = importance_matrix.mean(axis=0)\nstd_importance = importance_matrix.std(axis=0)\n\n# 5. Rank and select top-K features\nranking = np.argsort(mean_importance)&#091;::-1&#093;\nselected_features = ranking&#091;:top_k&#093;\n\n# 6. 
Report\nresults = pd.DataFrame({\n    \"feature\": np.arange(n_features),\n    \"mean_importance\": mean_importance.round(4),\n    \"std_importance\": std_importance.round(4),\n}).sort_values(\"mean_importance\", ascending=False)\n\nprint(f\"Selected top-{top_k} features: {sorted(selected_features.tolist())}\")\nprint(results.head(top_k).to_string(index=False))<\/textarea><\/pre><svg xmlns=\"http:\/\/www.w3.org\/2000\/svg\" style=\"width:24px;height:24px\" fill=\"none\" viewBox=\"0 0 24 24\" stroke=\"currentColor\" stroke-width=\"2\"><path class=\"with-check\" stroke-linecap=\"round\" stroke-linejoin=\"round\" d=\"M9 5H7a2 2 0 00-2 2v12a2 2 0 002 2h10a2 2 0 002-2V7a2 2 0 00-2-2h-2M9 5a2 2 0 002 2h2a2 2 0 002-2M9 5a2 2 0 012-2h2a2 2 0 012 2m-6 9l2 2 4-4\"><\/path><path class=\"without-check\" stroke-linecap=\"round\" stroke-linejoin=\"round\" d=\"M9 5H7a2 2 0 00-2 2v12a2 2 0 002 2h10a2 2 0 002-2V7a2 2 0 00-2-2h-2M9 5a2 2 0 002 2h2a2 2 0 002-2M9 5a2 2 0 012-2h2a2 2 0 012 2\"><\/path><\/svg><\/span><pre class=\"shiki github-light\" style=\"background-color: #fff\" tabindex=\"0\"><code><span class=\"line\"><span style=\"color: #D73A49\">import<\/span><span style=\"color: #24292E\"> numpy <\/span><span style=\"color: #D73A49\">as<\/span><span style=\"color: #24292E\"> np<\/span><\/span>\n<span class=\"line\"><span style=\"color: #D73A49\">import<\/span><span style=\"color: #24292E\"> pandas <\/span><span style=\"color: #D73A49\">as<\/span><span style=\"color: #24292E\"> pd<\/span><\/span>\n<span class=\"line\"><span style=\"color: #D73A49\">from<\/span><span style=\"color: #24292E\"> sklearn.datasets <\/span><span style=\"color: #D73A49\">import<\/span><span style=\"color: #24292E\"> make_classification<\/span><\/span>\n<span class=\"line\"><span style=\"color: #D73A49\">from<\/span><span style=\"color: #24292E\"> sklearn.ensemble <\/span><span style=\"color: #D73A49\">import<\/span><span style=\"color: #24292E\"> RandomForestClassifier<\/span><\/span>\n<span 
class=\"line\"><\/span>\n<span class=\"line\"><span style=\"color: #6A737D\"># 1. Generate synthetic data: 500 samples, 30 features, only 8 informative<\/span><\/span>\n<span class=\"line\"><span style=\"color: #24292E\">X, y <\/span><span style=\"color: #D73A49\">=<\/span><span style=\"color: #24292E\"> make_classification(<\/span><\/span>\n<span class=\"line\"><span style=\"color: #24292E\">    <\/span><span style=\"color: #E36209\">n_samples<\/span><span style=\"color: #D73A49\">=<\/span><span style=\"color: #005CC5\">500<\/span><span style=\"color: #24292E\">, <\/span><span style=\"color: #E36209\">n_features<\/span><span style=\"color: #D73A49\">=<\/span><span style=\"color: #005CC5\">30<\/span><span style=\"color: #24292E\">, <\/span><span style=\"color: #E36209\">n_informative<\/span><span style=\"color: #D73A49\">=<\/span><span style=\"color: #005CC5\">8<\/span><span style=\"color: #24292E\">,<\/span><\/span>\n<span class=\"line\"><span style=\"color: #24292E\">    <\/span><span style=\"color: #E36209\">n_redundant<\/span><span style=\"color: #D73A49\">=<\/span><span style=\"color: #005CC5\">2<\/span><span style=\"color: #24292E\">, <\/span><span style=\"color: #E36209\">n_repeated<\/span><span style=\"color: #D73A49\">=<\/span><span style=\"color: #005CC5\">0<\/span><span style=\"color: #24292E\">, <\/span><span style=\"color: #E36209\">random_state<\/span><span style=\"color: #D73A49\">=<\/span><span style=\"color: #005CC5\">0<\/span><\/span>\n<span class=\"line\"><span style=\"color: #24292E\">)<\/span><\/span>\n<span class=\"line\"><\/span>\n<span class=\"line\"><span style=\"color: #6A737D\"># 2. 
Ensemble averaging parameters<\/span><\/span>\n<span class=\"line\"><span style=\"color: #24292E\">n_runs <\/span><span style=\"color: #D73A49\">=<\/span><span style=\"color: #24292E\"> <\/span><span style=\"color: #005CC5\">50<\/span><span style=\"color: #24292E\">                <\/span><span style=\"color: #6A737D\"># number of models with different seeds<\/span><\/span>\n<span class=\"line\"><span style=\"color: #24292E\">top_k <\/span><span style=\"color: #D73A49\">=<\/span><span style=\"color: #24292E\"> <\/span><span style=\"color: #005CC5\">10<\/span><span style=\"color: #24292E\">                 <\/span><span style=\"color: #6A737D\"># number of top features to retain<\/span><\/span>\n<span class=\"line\"><\/span>\n<span class=\"line\"><span style=\"color: #24292E\">n_features <\/span><span style=\"color: #D73A49\">=<\/span><span style=\"color: #24292E\"> X.shape&#091;<\/span><span style=\"color: #005CC5\">1<\/span><span style=\"color: #24292E\">&#093;<\/span><\/span>\n<span class=\"line\"><span style=\"color: #24292E\">importance_matrix <\/span><span style=\"color: #D73A49\">=<\/span><span style=\"color: #24292E\"> np.zeros((n_runs, n_features))<\/span><\/span>\n<span class=\"line\"><\/span>\n<span class=\"line\"><span style=\"color: #6A737D\"># 3. 
Train multiple models with different seeds<\/span><\/span>\n<span class=\"line\"><span style=\"color: #D73A49\">for<\/span><span style=\"color: #24292E\"> i <\/span><span style=\"color: #D73A49\">in<\/span><span style=\"color: #24292E\"> <\/span><span style=\"color: #005CC5\">range<\/span><span style=\"color: #24292E\">(n_runs):<\/span><\/span>\n<span class=\"line\"><span style=\"color: #24292E\">    rf <\/span><span style=\"color: #D73A49\">=<\/span><span style=\"color: #24292E\"> RandomForestClassifier(<\/span><\/span>\n<span class=\"line\"><span style=\"color: #24292E\">        <\/span><span style=\"color: #E36209\">n_estimators<\/span><span style=\"color: #D73A49\">=<\/span><span style=\"color: #005CC5\">200<\/span><span style=\"color: #24292E\">,<\/span><\/span>\n<span class=\"line\"><span style=\"color: #24292E\">        <\/span><span style=\"color: #E36209\">random_state<\/span><span style=\"color: #D73A49\">=<\/span><span style=\"color: #24292E\">i,       <\/span><span style=\"color: #6A737D\"># vary seed across runs<\/span><\/span>\n<span class=\"line\"><span style=\"color: #24292E\">        <\/span><span style=\"color: #E36209\">n_jobs<\/span><span style=\"color: #D73A49\">=-<\/span><span style=\"color: #005CC5\">1<\/span><\/span>\n<span class=\"line\"><span style=\"color: #24292E\">    ).fit(X, y)<\/span><\/span>\n<span class=\"line\"><span style=\"color: #24292E\">    importance_matrix&#091;i&#093; <\/span><span style=\"color: #D73A49\">=<\/span><span style=\"color: #24292E\"> rf.feature_importances_<\/span><\/span>\n<span class=\"line\"><\/span>\n<span class=\"line\"><span style=\"color: #6A737D\"># 4. 
Aggregate importances<\/span><\/span>\n<span class=\"line\"><span style=\"color: #24292E\">mean_importance <\/span><span style=\"color: #D73A49\">=<\/span><span style=\"color: #24292E\"> importance_matrix.mean(<\/span><span style=\"color: #E36209\">axis<\/span><span style=\"color: #D73A49\">=<\/span><span style=\"color: #005CC5\">0<\/span><span style=\"color: #24292E\">)<\/span><\/span>\n<span class=\"line\"><span style=\"color: #24292E\">std_importance <\/span><span style=\"color: #D73A49\">=<\/span><span style=\"color: #24292E\"> importance_matrix.std(<\/span><span style=\"color: #E36209\">axis<\/span><span style=\"color: #D73A49\">=<\/span><span style=\"color: #005CC5\">0<\/span><span style=\"color: #24292E\">)<\/span><\/span>\n<span class=\"line\"><\/span>\n<span class=\"line\"><span style=\"color: #6A737D\"># 5. Rank and select top-K features<\/span><\/span>\n<span class=\"line\"><span style=\"color: #24292E\">ranking <\/span><span style=\"color: #D73A49\">=<\/span><span style=\"color: #24292E\"> np.argsort(mean_importance)&#091;::<\/span><span style=\"color: #D73A49\">-<\/span><span style=\"color: #005CC5\">1<\/span><span style=\"color: #24292E\">&#093;<\/span><\/span>\n<span class=\"line\"><span style=\"color: #24292E\">selected_features <\/span><span style=\"color: #D73A49\">=<\/span><span style=\"color: #24292E\"> ranking&#091;:top_k&#093;<\/span><\/span>\n<span class=\"line\"><\/span>\n<span class=\"line\"><span style=\"color: #6A737D\"># 6. 
Report<\/span><\/span>\n<span class=\"line\"><span style=\"color: #24292E\">results <\/span><span style=\"color: #D73A49\">=<\/span><span style=\"color: #24292E\"> pd.DataFrame({<\/span><\/span>\n<span class=\"line\"><span style=\"color: #24292E\">    <\/span><span style=\"color: #032F62\">&quot;feature&quot;<\/span><span style=\"color: #24292E\">: np.arange(n_features),<\/span><\/span>\n<span class=\"line\"><span style=\"color: #24292E\">    <\/span><span style=\"color: #032F62\">&quot;mean_importance&quot;<\/span><span style=\"color: #24292E\">: mean_importance.round(<\/span><span style=\"color: #005CC5\">4<\/span><span style=\"color: #24292E\">),<\/span><\/span>\n<span class=\"line\"><span style=\"color: #24292E\">    <\/span><span style=\"color: #032F62\">&quot;std_importance&quot;<\/span><span style=\"color: #24292E\">: std_importance.round(<\/span><span style=\"color: #005CC5\">4<\/span><span style=\"color: #24292E\">),<\/span><\/span>\n<span class=\"line\"><span style=\"color: #24292E\">}).sort_values(<\/span><span style=\"color: #032F62\">&quot;mean_importance&quot;<\/span><span style=\"color: #24292E\">, <\/span><span style=\"color: #E36209\">ascending<\/span><span style=\"color: #D73A49\">=<\/span><span style=\"color: #005CC5\">False<\/span><span style=\"color: #24292E\">)<\/span><\/span>\n<span class=\"line\"><\/span>\n<span class=\"line\"><span style=\"color: #005CC5\">print<\/span><span style=\"color: #24292E\">(<\/span><span style=\"color: #D73A49\">f<\/span><span style=\"color: #032F62\">&quot;Selected top-<\/span><span style=\"color: #005CC5\">{<\/span><span style=\"color: #24292E\">top_k<\/span><span style=\"color: #005CC5\">}<\/span><span style=\"color: #032F62\"> features: <\/span><span style=\"color: #005CC5\">{sorted<\/span><span style=\"color: #24292E\">(selected_features.tolist())<\/span><span style=\"color: #005CC5\">}<\/span><span style=\"color: #032F62\">&quot;<\/span><span style=\"color: #24292E\">)<\/span><\/span>\n<span 
class=\"line\"><span style=\"color: #005CC5\">print<\/span><span style=\"color: #24292E\">(results.head(top_k).to_string(<\/span><span style=\"color: #E36209\">index<\/span><span style=\"color: #D73A49\">=<\/span><span style=\"color: #005CC5\">False<\/span><span style=\"color: #24292E\">))<\/span><\/span><\/code><\/pre><\/div>\n\n\n\n<h2 class=\"wp-block-heading\">Key Parameters<\/h2>\n\n\n\n<figure style=\"padding-right:var(--wp--preset--spacing--40);padding-left:var(--wp--preset--spacing--40)\" class=\"wp-block-table\"><table><thead><tr><th>Parameter<\/th><th>Typical Value<\/th><th>Role<\/th><\/tr><\/thead><tbody><tr><td><code>n_runs<\/code><\/td><td>30 \u2013 100<\/td><td>More runs \u2192 more stable averaging<\/td><\/tr><tr><td><code>random_state<\/code><\/td><td>Varied per run<\/td><td>Ensures diversity across models<\/td><\/tr><tr><td><code>top_k<\/code> or threshold<\/td><td>Domain-dependent<\/td><td>Defines how many features to keep<\/td><\/tr><\/tbody><\/table><\/figure>\n\n\n\n<h2 class=\"wp-block-heading\">Aggregation Options<\/h2>\n\n\n\n<ul class=\"wp-block-list\">\n<li><strong>Mean<\/strong>: Standard choice; sensitive to outliers.<\/li>\n\n\n\n<li><strong>Median<\/strong>: More robust when some runs produce extreme values.<\/li>\n\n\n\n<li><strong>Rank averaging<\/strong>: Convert each run&#8217;s importances to ranks, then average \u2014 useful when importance scales differ across models.<\/li>\n<\/ul>\n\n\n\n<h2 class=\"wp-block-heading\">Practical Notes<\/h2>\n\n\n\n<ul class=\"wp-block-list\">\n<li>Works with any model exposing feature importances: <code>RandomForest<\/code>, <code>GradientBoosting<\/code>, <code>XGBoost<\/code>, <code>LightGBM<\/code>, or coefficient-based models (e.g., Lasso).<\/li>\n\n\n\n<li>For deep learning or model-agnostic setups, combine with <strong>permutation importance<\/strong> or <strong>SHAP values<\/strong> across runs.<\/li>\n\n\n\n<li>The <strong>standard deviation<\/strong> of importance across runs provides a 
useful uncertainty measure \u2014 features with high mean but also high std may be unreliable.<\/li>\n\n\n\n<li>Computationally heavier than a single run; consider parallelization with <code>joblib<\/code> or <code>n_jobs=-1<\/code>.<\/li>\n<\/ul>\n<div style='text-align:center' class='yasr-auto-insert-overall'><\/div><div style='text-align:center' class='yasr-auto-insert-visitor'><\/div>","protected":false},"excerpt":{"rendered":"<p>Delicate dandelion seed heads Random Seed Sensitivity in Feature Selection 1. Problem Statement Feature selection results can vary significantly depending on the random seed used during model training, data splitting, or stochastic optimization. This instability undermines reproducibility and trustworthiness, particularly in high-stakes domains such as healthcare and genomics where consistent feature identification is critical for&#8230;<\/p>\n","protected":false},"author":4,"featured_media":6245,"comment_status":"open","ping_status":"open","sticky":false,"template":"","format":"standard","meta":{"_bbp_topic_count":0,"_bbp_reply_count":0,"_bbp_total_topic_count":0,"_bbp_total_reply_count":0,"_bbp_voice_count":0,"_bbp_anonymous_reply_count":0,"_bbp_topic_count_hidden":0,"_bbp_reply_count_hidden":0,"_bbp_forum_subforum_count":0,"_kadence_starter_templates_imported_post":false,"_kad_post_transparent":"","_kad_post_title":"","_kad_post_layout":"","_kad_post_sidebar_id":"","_kad_post_content_style":"","_kad_post_vertical_padding":"","_kad_post_feature":"","_kad_post_feature_position":"","_kad_post_header":false,"_kad_post_footer":false,"_kad_post_classname":"","yasr_overall_rating":0,"yasr_post_is_review":"","yasr_auto_insert_disabled":"","yasr_review_type":"","fifu_image_url":"","fifu_image_alt":"","iawp_total_views":0,"footnotes":""},"categories":[56,373],"tags":[],"class_list":["post-6235","post","type-post","status-publish","format-standard","has-post-thumbnail","hentry","category-data-science-slug","category-feature-engineering-slug"],"
yasr_visitor_votes":{"stars_attributes":{"read_only":false,"span_bottom":false},"number_of_votes":0,"sum_votes":0},"jetpack_featured_media_url":"https:\/\/ykim.synology.me\/wordpress\/wp-content\/uploads\/2026\/04\/warm-floating-dandelion-seeds.jpg","_links":{"self":[{"href":"https:\/\/ykim.synology.me\/wordpress\/wp-json\/wp\/v2\/posts\/6235","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/ykim.synology.me\/wordpress\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/ykim.synology.me\/wordpress\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/ykim.synology.me\/wordpress\/wp-json\/wp\/v2\/users\/4"}],"replies":[{"embeddable":true,"href":"https:\/\/ykim.synology.me\/wordpress\/wp-json\/wp\/v2\/comments?post=6235"}],"version-history":[{"count":8,"href":"https:\/\/ykim.synology.me\/wordpress\/wp-json\/wp\/v2\/posts\/6235\/revisions"}],"predecessor-version":[{"id":6260,"href":"https:\/\/ykim.synology.me\/wordpress\/wp-json\/wp\/v2\/posts\/6235\/revisions\/6260"}],"wp:featuredmedia":[{"embeddable":true,"href":"https:\/\/ykim.synology.me\/wordpress\/wp-json\/wp\/v2\/media\/6245"}],"wp:attachment":[{"href":"https:\/\/ykim.synology.me\/wordpress\/wp-json\/wp\/v2\/media?parent=6235"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/ykim.synology.me\/wordpress\/wp-json\/wp\/v2\/categories?post=6235"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/ykim.synology.me\/wordpress\/wp-json\/wp\/v2\/tags?post=6235"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}