{"id":6409,"date":"2026-04-24T18:20:59","date_gmt":"2026-04-24T23:20:59","guid":{"rendered":"https:\/\/ykim.synology.me\/wordpress\/?p=6409"},"modified":"2026-04-24T20:34:11","modified_gmt":"2026-04-25T01:34:11","slug":"ml-methodology-taxonomy-for-within-wafer-variation-prediction","status":"publish","type":"post","link":"https:\/\/ykim.synology.me\/wordpress\/ml-methodology-taxonomy-for-within-wafer-variation-prediction-6409\/","title":{"rendered":"ML Methodology: Taxonomy for Within-Wafer Variation Prediction"},"content":{"rendered":"<style>.kadence-column6409_8067df-41 > .kt-inside-inner-col{display:flex;}.kadence-column6409_8067df-41 > .kt-inside-inner-col,.kadence-column6409_8067df-41 > .kt-inside-inner-col:before{border-top-left-radius:0px;border-top-right-radius:0px;border-bottom-right-radius:0px;border-bottom-left-radius:0px;}.kadence-column6409_8067df-41 > .kt-inside-inner-col{column-gap:var(--global-kb-gap-sm, 1rem);}.kadence-column6409_8067df-41 > .kt-inside-inner-col{flex-direction:row;flex-wrap:wrap;align-items:flex-end;justify-content:flex-start;}.kadence-column6409_8067df-41 > .kt-inside-inner-col > *, .kadence-column6409_8067df-41 > .kt-inside-inner-col > figure.wp-block-image, .kadence-column6409_8067df-41 > .kt-inside-inner-col > figure.wp-block-kadence-image{margin-top:0px;margin-bottom:0px;}.kadence-column6409_8067df-41 > .kt-inside-inner-col > .kb-image-is-ratio-size{flex-grow:1;}.kt-row-column-wrap > .kadence-column6409_8067df-41{align-self:flex-end;}.kt-inner-column-height-full:not(.kt-has-1-columns) > .wp-block-kadence-column.kadence-column6409_8067df-41{align-self:auto;}.kt-inner-column-height-full:not(.kt-has-1-columns) > .wp-block-kadence-column.kadence-column6409_8067df-41 > .kt-inside-inner-col{align-items:flex-end;}.kadence-column6409_8067df-41 > .kt-inside-inner-col:before{opacity:0.3;}.kadence-column6409_8067df-41{position:relative;}@media all and (max-width: 1024px){.kt-row-column-wrap > 
.kadence-column6409_8067df-41{align-self:flex-end;}}@media all and (max-width: 1024px){.kt-inner-column-height-full:not(.kt-has-1-columns) > .wp-block-kadence-column.kadence-column6409_8067df-41{align-self:auto;}}@media all and (max-width: 1024px){.kt-inner-column-height-full:not(.kt-has-1-columns) > .wp-block-kadence-column.kadence-column6409_8067df-41 > .kt-inside-inner-col{align-items:flex-end;}}@media all and (max-width: 1024px){.kadence-column6409_8067df-41 > .kt-inside-inner-col{flex-direction:row;flex-wrap:wrap;align-items:flex-end;justify-content:flex-start;}}@media all and (min-width: 768px) and (max-width: 1024px){.kadence-column6409_8067df-41 > .kt-inside-inner-col > *, .kadence-column6409_8067df-41 > .kt-inside-inner-col > figure.wp-block-image, .kadence-column6409_8067df-41 > .kt-inside-inner-col > figure.wp-block-kadence-image{margin-top:0px;margin-bottom:0px;}.kadence-column6409_8067df-41 > .kt-inside-inner-col > .kb-image-is-ratio-size{flex-grow:1;}}@media all and (max-width: 767px){.kt-row-column-wrap > .kadence-column6409_8067df-41{align-self:flex-end;}.kt-inner-column-height-full:not(.kt-has-1-columns) > .wp-block-kadence-column.kadence-column6409_8067df-41{align-self:auto;}.kt-inner-column-height-full:not(.kt-has-1-columns) > .wp-block-kadence-column.kadence-column6409_8067df-41 > .kt-inside-inner-col{align-items:flex-end;}.kadence-column6409_8067df-41 > .kt-inside-inner-col{flex-direction:row;flex-wrap:wrap;justify-content:flex-start;justify-content:flex-start;}.kadence-column6409_8067df-41 > .kt-inside-inner-col > *, .kadence-column6409_8067df-41 > .kt-inside-inner-col > figure.wp-block-image, .kadence-column6409_8067df-41 > .kt-inside-inner-col > figure.wp-block-kadence-image{margin-top:0px;margin-bottom:0px;}.kadence-column6409_8067df-41 > .kt-inside-inner-col > .kb-image-is-ratio-size{flex-grow:1;}}<\/style>\n<div class=\"wp-block-kadence-column kadence-column6409_8067df-41 kb-section-dir-horizontal\"><div 
class=\"kt-inside-inner-col\">\n<figure class=\"wp-block-image size-full is-resized\"><img loading=\"lazy\" decoding=\"async\" width=\"1024\" height=\"1024\" src=\"https:\/\/ykim.synology.me\/wordpress\/wp-content\/uploads\/2026\/04\/Moon_Farside_LRO1024px.jpg\" alt=\"\" class=\"wp-image-6399\" style=\"width:600px\" srcset=\"https:\/\/ykim.synology.me\/wordpress\/wp-content\/uploads\/2026\/04\/Moon_Farside_LRO1024px.jpg 1024w, https:\/\/ykim.synology.me\/wordpress\/wp-content\/uploads\/2026\/04\/Moon_Farside_LRO1024px-300x300.jpg 300w, https:\/\/ykim.synology.me\/wordpress\/wp-content\/uploads\/2026\/04\/Moon_Farside_LRO1024px-150x150.jpg 150w, https:\/\/ykim.synology.me\/wordpress\/wp-content\/uploads\/2026\/04\/Moon_Farside_LRO1024px-768x768.jpg 768w\" sizes=\"auto, (max-width: 1024px) 100vw, 1024px\" \/><\/figure>\n\n\n\n<p class=\"has-theme-palette-6-color has-text-color has-link-color wp-elements-402d4544f828caae64b59137ac711142 wp-block-paragraph\" style=\"font-size:8px\">the far side of the Moon<\/p>\n<\/div><\/div>\n\n\n\n<p class=\"wp-block-paragraph\" style=\"margin-top:var(--wp--preset--spacing--50)\">This report presents a structured taxonomy of machine learning methodologies specialized for Within-Wafer (WIW) variation prediction in semiconductor manufacturing. General-purpose ML approaches often fail to exploit the unique spatial structure of wafer data: circular geometry, process-induced radial symmetry, and strong inter-site correlations. 
The seven categories below organize WIW-specific methods into methodological primitives that can be combined in practical pipelines.<\/p>\n\n\n\n<h2 class=\"wp-block-heading\">Scope and Motivation<\/h2>\n\n\n<style>.kadence-column6409_acc7ef-88 > .kt-inside-inner-col,.kadence-column6409_acc7ef-88 > .kt-inside-inner-col:before{border-top-left-radius:0px;border-top-right-radius:0px;border-bottom-right-radius:0px;border-bottom-left-radius:0px;}.kadence-column6409_acc7ef-88 > .kt-inside-inner-col{column-gap:var(--global-kb-gap-sm, 1rem);}.kadence-column6409_acc7ef-88 > .kt-inside-inner-col{flex-direction:column;}.kadence-column6409_acc7ef-88 > .kt-inside-inner-col > .aligncenter{width:100%;}.kadence-column6409_acc7ef-88 > .kt-inside-inner-col:before{opacity:0.3;}.kadence-column6409_acc7ef-88{position:relative;}.kadence-column6409_acc7ef-88, .kt-inside-inner-col > .kadence-column6409_acc7ef-88:not(.specificity){margin-left:var(--global-kb-spacing-md, 2rem);}@media all and (max-width: 1024px){.kadence-column6409_acc7ef-88 > .kt-inside-inner-col{flex-direction:column;justify-content:center;}}@media all and (max-width: 767px){.kadence-column6409_acc7ef-88 > .kt-inside-inner-col{flex-direction:column;justify-content:center;}}<\/style>\n<div class=\"wp-block-kadence-column kadence-column6409_acc7ef-88\"><div class=\"kt-inside-inner-col\">\n<p class=\"wp-block-paragraph\">Within-wafer variation refers to systematic and random deviations of a measurement parameter (film thickness, critical dimension, overlay, material removal rate, etc.) across multiple sites on a single wafer. 
Typical setups measure $N_{sites}$ locations per wafer (commonly 9, 13, 17, 25, or 49 sites), with $N_{wafers}$ ranging from hundreds to tens of thousands depending on the fab context.<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">The central modeling challenge is:<\/p>\n\n\n\n$$T(r, \\theta, \\mathbf{s}) = T_{mean}(\\mathbf{s}) + T_{WIW}(r, \\theta, \\mathbf{s}) + \\epsilon$$\n\n\n\n<p class=\"wp-block-paragraph\">where $T$ is the measured parameter at polar coordinates $(r, \\theta)$, $\\mathbf{s}$ denotes sensor\/process features, $T_{mean}$ captures wafer-level behavior, $T_{WIW}$ captures spatial variation across the wafer, and $\\epsilon$ is irreducible noise. A good WIW-ML framework must model $T_{WIW}$ while respecting wafer geometry and the small-data regime.<\/p>\n<\/div><\/div>\n\n\n\n<h2 class=\"wp-block-heading\">A. Spatial Basis Decomposition<\/h2>\n\n\n<style>.kadence-column6409_49b1c0-46 > .kt-inside-inner-col,.kadence-column6409_49b1c0-46 > .kt-inside-inner-col:before{border-top-left-radius:0px;border-top-right-radius:0px;border-bottom-right-radius:0px;border-bottom-left-radius:0px;}.kadence-column6409_49b1c0-46 > .kt-inside-inner-col{column-gap:var(--global-kb-gap-sm, 1rem);}.kadence-column6409_49b1c0-46 > .kt-inside-inner-col{flex-direction:column;}.kadence-column6409_49b1c0-46 > .kt-inside-inner-col > .aligncenter{width:100%;}.kadence-column6409_49b1c0-46 > .kt-inside-inner-col:before{opacity:0.3;}.kadence-column6409_49b1c0-46{position:relative;}.kadence-column6409_49b1c0-46, .kt-inside-inner-col > .kadence-column6409_49b1c0-46:not(.specificity){margin-left:var(--global-kb-spacing-md, 2rem);}@media all and (max-width: 1024px){.kadence-column6409_49b1c0-46 > .kt-inside-inner-col{flex-direction:column;justify-content:center;}}@media all and (max-width: 767px){.kadence-column6409_49b1c0-46 > .kt-inside-inner-col{flex-direction:column;justify-content:center;}}<\/style>\n<div class=\"wp-block-kadence-column kadence-column6409_49b1c0-46\"><div 
class=\"kt-inside-inner-col\">\n<p class=\"wp-block-paragraph\"><strong>Core idea<\/strong>: Decompose the wafer spatial pattern into a linear combination of basis functions and learn the coefficients rather than raw site values. By replacing $N_{sites}$ raw targets with $K$ basis coefficients (typically $K \\ll N_{sites}$), target dimensionality is reduced and physical interpretation becomes possible. Zernike polynomials provide an orthogonal basis on the unit disk that perfectly matches wafer geometry, while PCA offers a data-driven basis and DCT a fixed frequency-domain basis. A small number of coefficients typically explains over 90% of wafer variation, making this approach highly effective in small-data regimes. Scanner Advanced Process Control (APC) systems already operate on this principle in high-volume manufacturing (Noh 2018).<\/p>\n\n\n\n<p class=\"wp-block-paragraph\"><strong>Mathematical formulation<\/strong>:<\/p>\n\n\n\n$$T(r, \\theta) = \\sum_{k=1}^{K} c_{k} \\cdot \\psi_{k}(r, \\theta)$$\n\n\n\n<p class=\"wp-block-paragraph\">where $\\psi_{k}$ are basis functions (Zernike, polynomial, DCT, PCA) and $c_{k}$ are the coefficients that become the new ML targets:<\/p>\n\n\n\n$$\\hat{\\mathbf{c}} = f_{ML}(\\mathbf{s}), \\quad \\hat{T}(r, \\theta) = \\sum_{k=1}^{K} \\hat{c}_{k} \\cdot \\psi_{k}(r, \\theta)$$\n\n\n\n<p class=\"wp-block-paragraph\">The Zernike basis satisfies orthogonality on the unit disk (with $\\epsilon_{m} = 2$ for $m = 0$ and $\\epsilon_{m} = 1$ otherwise):<\/p>\n\n\n\n$$\\int_{0}^{2\\pi} \\int_{0}^{1} Z_{n,m}(r, \\theta) \\cdot Z_{n',m'}(r, \\theta) \\cdot r \\, dr \\, d\\theta = \\frac{\\epsilon_{m} \\pi}{2n+2} \\delta_{n,n'} \\delta_{m,m'}$$\n\n\n\n<p class=\"wp-block-paragraph\"><strong>Conceptual diagram<\/strong>:<\/p>\n\n\n\n<pre style=\"font-family: Consolas, Monaco, 'Courier New', monospace; background-color: #fff; padding: 16px; border-radius: 6px; overflow-x: auto; line-height: 1.4; font-size: 0.9em; white-space: pre; border: none;\">   13 sites                         Basis coefficients\n   +-----------+               
     +----------------+\n   | o o o o o |                    | c1 (piston)    |\n   |o * * * * o|  --- fitting --&gt;   | c2 (tilt_x)    |\n   |o * @ * * o|                    | c3 (tilt_y)    |\n   |o * * * * o|                    | c4 (bowl)      |\n   | o o o o o |                    | c5 (astigmatism)|\n   +-----------+                    +----------------+\n   raw targets                      learn THESE\n       |                                    |\n       |                                    v\n       |                        Sensors -&gt; ML -&gt; c_hat\n       |                                    |\n       +------ reconstruct &lt;----------------+<\/pre>\n\n\n\n<p class=\"wp-block-paragraph\"><strong>Key concepts and references<\/strong>:<\/p>\n\n\n\n<ul class=\"wp-block-list\">\n<li>Zernike polynomial decomposition \u2014 Noh 2018 (overlay APC in HVM)<\/li>\n\n\n\n<li>Radial-azimuthal polynomial decomposition \u2014 Dwivedi 2023<\/li>\n\n\n\n<li>2D bivariate polynomial (2nd order) \u2014 Dwivedi 2023<\/li>\n\n\n\n<li>DCT basis with sparsity prior \u2014 Zhang 2011<\/li>\n\n\n\n<li>PCA \/ Karhunen-Lo\u00e8ve on wafer maps \u2014 Kazemi 2020<\/li>\n<\/ul>\n<\/div><\/div>\n\n\n\n<h2 class=\"wp-block-heading\">B. 
Spatial Correlation Modeling (Gaussian Process Family)<\/h2>\n\n\n<style>.kadence-column6409_66123f-93 > .kt-inside-inner-col,.kadence-column6409_66123f-93 > .kt-inside-inner-col:before{border-top-left-radius:0px;border-top-right-radius:0px;border-bottom-right-radius:0px;border-bottom-left-radius:0px;}.kadence-column6409_66123f-93 > .kt-inside-inner-col{column-gap:var(--global-kb-gap-sm, 1rem);}.kadence-column6409_66123f-93 > .kt-inside-inner-col{flex-direction:column;}.kadence-column6409_66123f-93 > .kt-inside-inner-col > .aligncenter{width:100%;}.kadence-column6409_66123f-93 > .kt-inside-inner-col:before{opacity:0.3;}.kadence-column6409_66123f-93{position:relative;}.kadence-column6409_66123f-93, .kt-inside-inner-col > .kadence-column6409_66123f-93:not(.specificity){margin-left:var(--global-kb-spacing-md, 2rem);}@media all and (max-width: 1024px){.kadence-column6409_66123f-93 > .kt-inside-inner-col{flex-direction:column;justify-content:center;}}@media all and (max-width: 767px){.kadence-column6409_66123f-93 > .kt-inside-inner-col{flex-direction:column;justify-content:center;}}<\/style>\n<div class=\"wp-block-kadence-column kadence-column6409_66123f-93\"><div class=\"kt-inside-inner-col\">\n<p class=\"wp-block-paragraph\"><strong>Core idea<\/strong>: Encode the natural assumption that nearby wafer locations have correlated measurements into a kernel function. Gaussian Processes (GPs) provide not only point predictions but also predictive uncertainty, which is essential for fallback decisions in Virtual Metrology. Multi-task GPs (MTGPs) extend this by jointly predicting multiple sites while learning inter-site correlations through a coregionalization matrix. Hierarchical GPs further handle discontinuities between site clusters caused by probe-card parasitics or different measurement systems (Shintani 2021). 
This is the most validated approach in the hundreds-to-thousands wafer regime.<\/p>\n\n\n\n<p class=\"wp-block-paragraph\"><strong>Mathematical formulation<\/strong>:<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">For a standard spatial GP:<\/p>\n\n\n\n$$f(\\mathbf{x}) \\sim \\mathcal{GP}(m(\\mathbf{x}),\\ k(\\mathbf{x}, \\mathbf{x}'))$$\n\n\n\n<p class=\"wp-block-paragraph\">with the Mat\u00e9rn 5\/2 kernel, commonly used for wafer spatial correlation:<\/p>\n\n\n\n$$k_{5\/2}(\\mathbf{x}, \\mathbf{x}') = \\sigma^{2} \\left(1 + \\frac{\\sqrt{5}d}{\\ell} + \\frac{5d^{2}}{3\\ell^{2}}\\right) \\exp\\left(-\\frac{\\sqrt{5}d}{\\ell}\\right)$$\n\n\n\n<p class=\"wp-block-paragraph\">where $d = \\|\\mathbf{x} - \\mathbf{x}'\\|$ and $\\ell$ is the length scale.<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">Multi-task GP with Intrinsic Coregionalization Model (ICM):<\/p>\n\n\n\n$$k_{MTGP}((\\mathbf{x}, i),\\ (\\mathbf{x}', j)) = k_{process}(\\mathbf{x}, \\mathbf{x}') \\cdot B[i, j]$$\n\n\n\n<p class=\"wp-block-paragraph\">where $B \\in \\mathbb{R}^{N_{sites} \\times N_{sites}}$ is a learned positive semi-definite matrix capturing site-to-site correlations, and $i, j$ index the sites.<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">The predictive distribution at a new point is:<\/p>\n\n\n\n$$p(f_{*} | \\mathbf{X}, \\mathbf{y}, \\mathbf{x}_{*}) = \\mathcal{N}(\\mu_{*},\\ \\sigma_{*}^{2})$$\n\n\n\n<p class=\"wp-block-paragraph\"><strong>Conceptual diagram<\/strong>:<\/p>\n\n\n\n<pre style=\"font-family: Consolas, Monaco, 'Courier New', monospace; background-color: #fff; padding: 16px; border-radius: 6px; overflow-x: auto; line-height: 1.4; font-size: 0.9em; white-space: pre; border: none;\">         kernel k(x, x') measures similarity\n              +---------------------+\n              |                     |\n         *----+--*                  |   high correlation\n         site1  site2               |   (nearby)\n              |                     |\n      
        |         *-----------+--*   low correlation\n              |         site3       site13   (far apart)\n              +---------------------+\n\n   13 x 13 task correlation matrix B (learned):\n   +-------------------------------+\n   | 1.0  0.9  0.8  ...  0.2       |  &lt;- site correlations\n   | 0.9  1.0  0.9  ...  0.3       |     captured automatically\n   |  :                      :     |\n   +-------------------------------+\n\n   Prediction: mu(x*) +\/- sigma(x*)   &lt;- uncertainty included<\/pre>\n\n\n\n<p class=\"wp-block-paragraph\"><strong>Key concepts and references<\/strong>:<\/p>\n\n\n\n<ul class=\"wp-block-list\">\n<li>Kriging for spatial interpolation \u2014 Cressie 1993<\/li>\n\n\n\n<li>Multi-task GP with coregionalization \u2014 Bonilla 2008<\/li>\n\n\n\n<li>Spatial GP for wafer e-test with variance decomposition \u2014 Reda 2010<\/li>\n\n\n\n<li>Multi-task GP for CMP with uncertainty \u2014 Cai 2020<\/li>\n\n\n\n<li>Hierarchical GP for multi-site RF testing \u2014 Shintani 2021<\/li>\n\n\n\n<li>Multilevel kernel methods for VM \u2014 Schirru 2011<\/li>\n<\/ul>\n<\/div><\/div>\n\n\n\n<h2 class=\"wp-block-heading\">C. 
Hierarchical Variation Decomposition<\/h2>\n\n\n<style>.kadence-column6409_67560e-5c > .kt-inside-inner-col,.kadence-column6409_67560e-5c > .kt-inside-inner-col:before{border-top-left-radius:0px;border-top-right-radius:0px;border-bottom-right-radius:0px;border-bottom-left-radius:0px;}.kadence-column6409_67560e-5c > .kt-inside-inner-col{column-gap:var(--global-kb-gap-sm, 1rem);}.kadence-column6409_67560e-5c > .kt-inside-inner-col{flex-direction:column;}.kadence-column6409_67560e-5c > .kt-inside-inner-col > .aligncenter{width:100%;}.kadence-column6409_67560e-5c > .kt-inside-inner-col:before{opacity:0.3;}.kadence-column6409_67560e-5c{position:relative;}.kadence-column6409_67560e-5c, .kt-inside-inner-col > .kadence-column6409_67560e-5c:not(.specificity){margin-left:var(--global-kb-spacing-md, 2rem);}@media all and (max-width: 1024px){.kadence-column6409_67560e-5c > .kt-inside-inner-col{flex-direction:column;justify-content:center;}}@media all and (max-width: 767px){.kadence-column6409_67560e-5c > .kt-inside-inner-col{flex-direction:column;justify-content:center;}}<\/style>\n<div class=\"wp-block-kadence-column kadence-column6409_67560e-5c\"><div class=\"kt-inside-inner-col\">\n<p class=\"wp-block-paragraph\"><strong>Core idea<\/strong>: Decompose total variation into physically meaningful components and apply ML only to the predictable ones. A typical decomposition separates layout-dependent effects, Intra-Wafer Systematic (IWS) variation, Wafer-to-Wafer (WTW) mean shifts, and random residuals, each corresponding to distinct physical origins (Dwivedi 2023). This prevents ML models from overfitting random noise and enables per-component model selection. Combined with ANOVA-style variance decomposition, it also provides diagnostic insight into which variation source dominates. 
This framework is the de facto standard in silicon photonics and mature fab variation analysis.<\/p>\n\n\n\n<p class=\"wp-block-paragraph\"><strong>Mathematical formulation<\/strong>:<\/p>\n\n\n\n$$T_{total}(r, \\theta, t) = T_{layout}(r, \\theta) + T_{IWS}(r, \\theta) + T_{WTW}(t) + \\epsilon$$\n\n\n\n<p class=\"wp-block-paragraph\">Variance decomposition under orthogonality assumptions:<\/p>\n\n\n\n$$\\sigma_{total}^{2} = \\sigma_{layout}^{2} + \\sigma_{IWS}^{2} + \\sigma_{WTW}^{2} + \\sigma_{random}^{2}$$\n\n\n\n<p class=\"wp-block-paragraph\">Each component is modeled separately:<\/p>\n\n\n\n$$\\hat{T}_{IWS}(r, \\theta) = \\sum_{k=1}^{K_{IWS}} c_{k}^{IWS}(\\mathbf{s}) \\cdot \\psi_{k}(r, \\theta)$$\n\n\n\n$$\\hat{T}_{WTW}(t) = f_{WTW}(\\mathbf{s}_{tool}(t))$$\n\n\n\n<p class=\"wp-block-paragraph\"><strong>Conceptual diagram<\/strong>:<\/p>\n\n\n\n<pre style=\"font-family: Consolas, Monaco, 'Courier New', monospace; background-color: #fff; padding: 16px; border-radius: 6px; overflow-x: auto; line-height: 1.4; font-size: 0.9em; white-space: pre; border: none;\">   Total Variation (observed)\n            |\n            v\n   +-------------------------------------+\n   |  Layout-dependent (pattern density) |  &lt;- physical model\n   +-------------------------------------+\n   |  IWS (Intra-Wafer Systematic)       |  &lt;- polynomial\/Zernike\n   |    radial + bivariate polynomial    |     ML predicts this\n   +-------------------------------------+\n   |  WTW (Wafer-to-Wafer mean shift)    |  &lt;- tool state features\n   |                                     |     ML predicts this\n   +-------------------------------------+\n   |  Random residual                    |  &lt;- NOT predicted,\n   |                                     |     monitored as Cpk\n   +-------------------------------------+\n\n   sigma^2_total = sigma^2_layout + sigma^2_IWS\n                 + sigma^2_WTW + sigma^2_random<\/pre>\n\n\n\n<p class=\"wp-block-paragraph\"><strong>Key concepts and 
references<\/strong>:<\/p>\n\n\n\n<ul class=\"wp-block-list\">\n<li>Hierarchical model for photonic variation \u2014 Dwivedi 2023<\/li>\n\n\n\n<li>Spatial estimation and variance decomposition \u2014 Reda 2010<\/li>\n\n\n\n<li>Mixed-effect profile monitoring \u2014 Liu 2022<\/li>\n\n\n\n<li>Hierarchical multi-task learning for wafer quality \u2014 He 2018<\/li>\n\n\n\n<li>Multiplicative Product Factor for hierarchy \u2014 Rothe 2025<\/li>\n<\/ul>\n<\/div><\/div>\n\n\n\n<h2 class=\"wp-block-heading\">D. Compressed Sensing and Sparse Recovery<\/h2>\n\n\n<style>.kadence-column6409_030649-d2 > .kt-inside-inner-col,.kadence-column6409_030649-d2 > .kt-inside-inner-col:before{border-top-left-radius:0px;border-top-right-radius:0px;border-bottom-right-radius:0px;border-bottom-left-radius:0px;}.kadence-column6409_030649-d2 > .kt-inside-inner-col{column-gap:var(--global-kb-gap-sm, 1rem);}.kadence-column6409_030649-d2 > .kt-inside-inner-col{flex-direction:column;}.kadence-column6409_030649-d2 > .kt-inside-inner-col > .aligncenter{width:100%;}.kadence-column6409_030649-d2 > .kt-inside-inner-col:before{opacity:0.3;}.kadence-column6409_030649-d2{position:relative;}.kadence-column6409_030649-d2, .kt-inside-inner-col > .kadence-column6409_030649-d2:not(.specificity){margin-left:var(--global-kb-spacing-md, 2rem);}@media all and (max-width: 1024px){.kadence-column6409_030649-d2 > .kt-inside-inner-col{flex-direction:column;justify-content:center;}}@media all and (max-width: 767px){.kadence-column6409_030649-d2 > .kt-inside-inner-col{flex-direction:column;justify-content:center;}}<\/style>\n<div class=\"wp-block-kadence-column kadence-column6409_030649-d2\"><div class=\"kt-inside-inner-col\">\n<p class=\"wp-block-paragraph\"><strong>Core idea<\/strong>: Exploit the empirical observation that wafer spatial patterns are sparse in a transform domain (DCT, Fourier, wavelet). 
Using $L_{1}$ regularization, the full wafer can be recovered from far fewer samples than the Nyquist rate would require. Unlike the Gaussian Process family, which uses spatial correlation as its prior, Compressed Sensing uses <strong>frequency-domain sparsity<\/strong> as its prior (Zhang 2011). The problem reduces to linear programming and runs efficiently. In industrial data, delay measurements on 50 dies were used to reconstruct the delay of 269 dies on the same wafer, with up to 10x error reduction compared to 2D interpolation and Kriging. This is the primary framework for test cost reduction in production testing.<\/p>\n\n\n\n<p class=\"wp-block-paragraph\"><strong>Mathematical formulation<\/strong>:<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">Let $\\mathbf{g} \\in \\mathbb{R}^{PQ}$ be the vectorized full wafer map and $\\boldsymbol{\\eta}$ its DCT coefficients:<\/p>\n\n\n\n$$\\mathbf{g} = \\mathbf{\\Psi} \\boldsymbol{\\eta}, \\quad \\mathbf{\\Psi} = \\text{inverse DCT matrix}$$\n\n\n\n<p class=\"wp-block-paragraph\">Given sparse measurements $\\mathbf{y} = \\mathbf{A} \\mathbf{g}$ where $\\mathbf{A}$ is a selection matrix, recover $\\boldsymbol{\\eta}$ by:<\/p>\n\n\n\n$$\\hat{\\boldsymbol{\\eta}} = \\arg\\min_{\\boldsymbol{\\eta}} \\|\\boldsymbol{\\eta}\\|_{1} \\quad \\text{s.t.} \\quad \\mathbf{y} = \\mathbf{A} \\mathbf{\\Psi} \\boldsymbol{\\eta}$$\n\n\n\n<p class=\"wp-block-paragraph\">In practice, the constrained form is relaxed to:<\/p>\n\n\n\n$$\\hat{\\boldsymbol{\\eta}} = \\arg\\min_{\\boldsymbol{\\eta}} \\left\\{ \\|\\mathbf{y} - \\mathbf{A} \\mathbf{\\Psi} \\boldsymbol{\\eta}\\|_{2}^{2} + \\lambda \\|\\boldsymbol{\\eta}\\|_{1} \\right\\}$$\n\n\n\n<p class=\"wp-block-paragraph\">and $\\lambda$ is selected via cross-validation. 
The full wafer is reconstructed as $\\hat{\\mathbf{g}} = \\mathbf{\\Psi} \\hat{\\boldsymbol{\\eta}}$.<\/p>\n\n\n\n<p class=\"wp-block-paragraph\"><strong>Conceptual diagram<\/strong>:<\/p>\n\n\n\n<pre style=\"font-family: Consolas, Monaco, 'Courier New', monospace; background-color: #fff; padding: 16px; border-radius: 6px; overflow-x: auto; line-height: 1.4; font-size: 0.9em; white-space: pre; border: none;\">   Wafer measurement              DCT coefficients\n   +-------------+               +----------------+\n   | . . * . . * |               | ##..........   |  &lt;- most near zero\n   | * . . . * . |   --- DCT --&gt; | #...........   |     (SPARSE!)\n   | . * . * . . |               | ............   |\n   | . . * . . * |               | ............   |\n   +-------------+               +----------------+\n   few measured dies              prior: sparsity\n         |                               |\n         |                               v\n         |           solve: min ||eta||_1  s.t.  y = A*Psi*eta\n         |                  (linear programming)\n         |                               |\n         +--- reconstruct full wafer &lt;---+\n                   via inverse DCT<\/pre>\n\n\n\n<p class=\"wp-block-paragraph\"><strong>Key concepts and references<\/strong>:<\/p>\n\n\n\n<ul class=\"wp-block-list\">\n<li>Virtual Probe (DCT + L1) \u2014 Zhang 2011<\/li>\n\n\n\n<li>Bayesian Virtual Probe \u2014 Zhang 2010<\/li>\n\n\n\n<li>Joint Virtual Probe for multi-item correlation \u2014 Zhang 2014<\/li>\n\n\n\n<li>Multi-Wafer Virtual Probe with WTW correlation \u2014 Zhang 2012<\/li>\n\n\n\n<li>3D compressed sensing with KLT \u2014 Ahmadi 2015<\/li>\n<\/ul>\n<\/div><\/div>\n\n\n\n<h2 class=\"wp-block-heading\">E. 
Physics-Informed and Hybrid Approaches<\/h2>\n\n\n<style>.kadence-column6409_707e73-3c > .kt-inside-inner-col,.kadence-column6409_707e73-3c > .kt-inside-inner-col:before{border-top-left-radius:0px;border-top-right-radius:0px;border-bottom-right-radius:0px;border-bottom-left-radius:0px;}.kadence-column6409_707e73-3c > .kt-inside-inner-col{column-gap:var(--global-kb-gap-sm, 1rem);}.kadence-column6409_707e73-3c > .kt-inside-inner-col{flex-direction:column;}.kadence-column6409_707e73-3c > .kt-inside-inner-col > .aligncenter{width:100%;}.kadence-column6409_707e73-3c > .kt-inside-inner-col:before{opacity:0.3;}.kadence-column6409_707e73-3c{position:relative;}.kadence-column6409_707e73-3c, .kt-inside-inner-col > .kadence-column6409_707e73-3c:not(.specificity){margin-left:var(--global-kb-spacing-md, 2rem);}@media all and (max-width: 1024px){.kadence-column6409_707e73-3c > .kt-inside-inner-col{flex-direction:column;justify-content:center;}}@media all and (max-width: 767px){.kadence-column6409_707e73-3c > .kt-inside-inner-col{flex-direction:column;justify-content:center;}}<\/style>\n<div class=\"wp-block-kadence-column kadence-column6409_707e73-3c\"><div class=\"kt-inside-inner-col\">\n<p class=\"wp-block-paragraph\"><strong>Core idea<\/strong>: Embed PDEs, conservation laws, or reaction kinetics directly into neural network loss functions or architectures. Pure data-driven models fail when wafer counts are limited, but physics acts as a regularizer, enabling full-field prediction from sparse sensors. Physics-Informed Neural Networks (PINNs) enforce equation residuals, while Fourier Neural Operators (FNOs) learn the solution operator of a PDE family (Go 2025). Training is slow but inference is faster than traditional numerical solvers. 
This approach is particularly valuable for new processes with limited data, and for processes where physical models are well established (heat transfer, diffusion, plasma kinetics).<\/p>\n\n\n\n<p class=\"wp-block-paragraph\"><strong>Mathematical formulation<\/strong>:<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">For a PDE of the form $\\mathcal{N}[T](\\mathbf{x}, t) = 0$, the PINN loss is:<\/p>\n\n\n\n$$\\mathcal{L} = \\mathcal{L}_{data} + \\lambda \\cdot \\mathcal{L}_{physics}$$\n\n\n\n$$\\mathcal{L}_{data} = \\frac{1}{N_{d}} \\sum_{i=1}^{N_{d}} \\left( T_{\\theta}(\\mathbf{x}_{i}, t_{i}) - T_{i}^{obs} \\right)^{2}$$\n\n\n\n$$\\mathcal{L}_{physics} = \\frac{1}{N_{c}} \\sum_{j=1}^{N_{c}} \\left( \\mathcal{N}[T_{\\theta}](\\mathbf{x}_{j}, t_{j}) \\right)^{2}$$\n\n\n\n<p class=\"wp-block-paragraph\">For wafer thermal dynamics (e.g., PEB step), the governing equation is:<\/p>\n\n\n\n$$\\frac{\\partial T}{\\partial t} - \\alpha \\nabla^{2} T = Q(\\mathbf{x}, t)$$\n\n\n\n<p class=\"wp-block-paragraph\">A neural operator learns a mapping between function spaces. The branch-trunk (DeepONet) form of this mapping is shown below; a Fourier Neural Operator learns the same kind of mapping but parameterizes its kernel layers in Fourier space:<\/p>\n\n\n\n$$\\mathcal{G}_{\\theta}(a)(\\mathbf{y}) = \\sum_{k=1}^{p} b_{k}(a(\\mathbf{x}_{1}),\\ \\ldots,\\ a(\\mathbf{x}_{m})) \\cdot t_{k}(\\mathbf{y})$$\n\n\n\n<p class=\"wp-block-paragraph\">where $a$ is the input function (e.g., temperature field) sampled at locations $\\mathbf{x}_{1}, \\ldots, \\mathbf{x}_{m}$, $b_{k}$ and $t_{k}$ are the learned branch and trunk networks, and $\\mathcal{G}_{\\theta}$ maps $a$ to the output function (e.g., warpage field).<\/p>\n\n\n\n<p class=\"wp-block-paragraph\"><strong>Conceptual diagram<\/strong>:<\/p>\n\n\n\n<pre style=\"font-family: Consolas, Monaco, 'Courier New', monospace; background-color: #fff; padding: 16px; border-radius: 6px; overflow-x: auto; line-height: 1.4; font-size: 0.9em; white-space: pre; border: none;\">   PINN loss combines data and physics:\n   +-------------------------------------------------+\n   |  L = L_data  +  lambda * L_physics              |\n   |      ------      ---------------                |\n   |      MSE on      PDE residual                   |\n   |      
measured    (e.g. dT\/dt - alpha*nabla^2 T) |\n   |      points      enforced everywhere            |\n   +-------------------------------------------------+\n\n   Sparse sensors         Full wafer field\n   +-----------+          +--------------+\n   | * . . * . |          | ~~~~~~~~~~~~ |\n   | . . * . . | --PINN--&gt;| ~~~~~~~~~~~~ |  &lt;- continuous T(x,y)\n   | * . . . . |          | ~~~~~~~~~~~~ |     warpage W(x,y)\n   | . * . * . |          | ~~~~~~~~~~~~ |\n   +-----------+          +--------------+\n   6 points                full continuous field\n                           (physics-constrained)<\/pre>\n\n\n\n<p class=\"wp-block-paragraph\"><strong>Key concepts and references<\/strong>:<\/p>\n\n\n\n<ul class=\"wp-block-list\">\n<li>PINN for semiconductor deposition review \u2014 Han 2025<\/li>\n\n\n\n<li>PBSM + Fourier Neural Operator for PEB process \u2014 Go 2025<\/li>\n\n\n\n<li>Neural Master Equation for plasma etch kinetics \u2014 Kim 2025<\/li>\n\n\n\n<li>ACO + BPNN for SiC epitaxy uniformity optimization \u2014 Zhang 2024<\/li>\n\n\n\n<li>CFD + ANN hybrid for ALD SiO2 \u2014 Li 2022<\/li>\n<\/ul>\n<\/div><\/div>\n\n\n\n<h2 class=\"wp-block-heading\">F. 
Multi-task and Multi-output Learning (Non-GP)<\/h2>\n\n\n<style>.kadence-column6409_184fc3-69 > .kt-inside-inner-col,.kadence-column6409_184fc3-69 > .kt-inside-inner-col:before{border-top-left-radius:0px;border-top-right-radius:0px;border-bottom-right-radius:0px;border-bottom-left-radius:0px;}.kadence-column6409_184fc3-69 > .kt-inside-inner-col{column-gap:var(--global-kb-gap-sm, 1rem);}.kadence-column6409_184fc3-69 > .kt-inside-inner-col{flex-direction:column;}.kadence-column6409_184fc3-69 > .kt-inside-inner-col > .aligncenter{width:100%;}.kadence-column6409_184fc3-69 > .kt-inside-inner-col:before{opacity:0.3;}.kadence-column6409_184fc3-69{position:relative;}.kadence-column6409_184fc3-69, .kt-inside-inner-col > .kadence-column6409_184fc3-69:not(.specificity){margin-left:var(--global-kb-spacing-md, 2rem);}@media all and (max-width: 1024px){.kadence-column6409_184fc3-69 > .kt-inside-inner-col{flex-direction:column;justify-content:center;}}@media all and (max-width: 767px){.kadence-column6409_184fc3-69 > .kt-inside-inner-col{flex-direction:column;justify-content:center;}}<\/style>\n<div class=\"wp-block-kadence-column kadence-column6409_184fc3-69\"><div class=\"kt-inside-inner-col\">\n<p class=\"wp-block-paragraph\"><strong>Core idea<\/strong>: Predict multiple sites or multiple measurements jointly with a single model that learns a shared representation across tasks. Unlike GP-based methods, which express correlation through a kernel, neural-network or gradient-boosting multi-output models capture correlation implicitly through shared parameters. The canonical architecture is a shared backbone with task-specific heads; regularization-based alternatives include multi-task Lasso. Multi-task learning acts as regularization and typically improves accuracy over single-task models when per-site wafer counts are low. 
CatBoost MultiRMSE is the tree-based representative (Dorogush 2018).<\/p>\n\n\n\n<p class=\"wp-block-paragraph\"><strong>Mathematical formulation<\/strong>:<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">Shared-encoder multi-task architecture:<\/p>\n\n\n\n$$\\mathbf{z} = f_{shared}(\\mathbf{s}; \\boldsymbol{\\theta}_{shared})$$\n\n\n\n$$\\hat{y}_{i} = h_{i}(\\mathbf{z}; \\boldsymbol{\\theta}_{i}), \\quad i = 1, \\ldots, N_{sites}$$\n\n\n\n<p class=\"wp-block-paragraph\">Joint loss with uncertainty weighting (Kendall 2018):<\/p>\n\n\n\n$$\\mathcal{L} = \\sum_{i=1}^{N_{sites}} \\frac{1}{2\\sigma_{i}^{2}} \\mathcal{L}_{i}(\\boldsymbol{\\theta}_{shared}, \\boldsymbol{\\theta}_{i}) + \\log \\sigma_{i}$$\n\n\n\n<p class=\"wp-block-paragraph\">where $\\sigma_{i}$ is a learned per-task uncertainty that automatically balances the multi-task loss.<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">Multi-task Lasso formulation:<\/p>\n\n\n\n$$\\hat{\\mathbf{W}} = \\arg\\min_{\\mathbf{W}} \\left\\{ \\sum_{i=1}^{N_{sites}} \\|\\mathbf{y}_{i} &#8211; \\mathbf{X} \\mathbf{w}_{i}\\|_{2}^{2} + \\lambda \\sum_{j=1}^{p} \\|\\mathbf{W}_{j,:}\\|_{2} \\right\\}$$\n\n\n\n<p class=\"wp-block-paragraph\">The group-L2 penalty encourages shared feature selection across all sites.<\/p>\n\n\n\n<p class=\"wp-block-paragraph\"><strong>Conceptual diagram<\/strong>:<\/p>\n\n\n\n<pre style=\"font-family: Consolas, Monaco, 'Courier New', monospace; background-color: #fff; padding: 16px; border-radius: 6px; overflow-x: auto; line-height: 1.4; font-size: 0.9em; white-space: pre; border: none;\">   Shared encoder learns common representation\n\n   Sensor features\n        |\n        v\n   +-------------------+\n   |  Shared backbone  |  &lt;- learns process physics\n   |   (MLP \/ CNN)     |     common across all sites\n   +--------+----------+\n            |\n     +------+------+------+------+\n     v      v      v      v      v\n   +---+ +---+  +---+  +---+  +---+\n   | H1| | H2|  | H3|  ...   
| H13|  &lt;- task-specific heads\n   +-+-+ +-+-+  +-+-+         +-+-+     (site-level detail)\n     v      v      v             v\n    y1     y2     y3     ...    y13\n   site1  site2  site3          site13<\/pre>\n\n\n\n<p class=\"wp-block-paragraph\"><strong>Key concepts and references<\/strong>:<\/p>\n\n\n\n<ul class=\"wp-block-list\">\n<li>Multi-task VM across chambers \u2014 Park 2018<\/li>\n\n\n\n<li>Multi-level Lasso for VM \u2014 Schirru 2011<\/li>\n\n\n\n<li>Hierarchical multi-task for wafer quality \u2014 He 2018<\/li>\n\n\n\n<li>Multi-task uncertainty weighting \u2014 Kendall 2018<\/li>\n\n\n\n<li>CatBoost MultiRMSE \u2014 Dorogush 2018<\/li>\n\n\n\n<li>Shared encoder with site heads for thin film VM \u2014 Liu 2025<\/li>\n<\/ul>\n<\/div><\/div>\n\n\n\n<h2 class=\"wp-block-heading\">G. Ensemble and Hybrid (Mix and Match)<\/h2>\n\n\n<style>.kadence-column6409_c50279-92 > .kt-inside-inner-col,.kadence-column6409_c50279-92 > .kt-inside-inner-col:before{border-top-left-radius:0px;border-top-right-radius:0px;border-bottom-right-radius:0px;border-bottom-left-radius:0px;}.kadence-column6409_c50279-92 > .kt-inside-inner-col{column-gap:var(--global-kb-gap-sm, 1rem);}.kadence-column6409_c50279-92 > .kt-inside-inner-col{flex-direction:column;}.kadence-column6409_c50279-92 > .kt-inside-inner-col > .aligncenter{width:100%;}.kadence-column6409_c50279-92 > .kt-inside-inner-col:before{opacity:0.3;}.kadence-column6409_c50279-92{position:relative;}.kadence-column6409_c50279-92, .kt-inside-inner-col > .kadence-column6409_c50279-92:not(.specificity){margin-left:var(--global-kb-spacing-md, 2rem);}@media all and (max-width: 1024px){.kadence-column6409_c50279-92 > .kt-inside-inner-col{flex-direction:column;justify-content:center;}}@media all and (max-width: 767px){.kadence-column6409_c50279-92 > .kt-inside-inner-col{flex-direction:column;justify-content:center;}}<\/style>\n<div class=\"wp-block-kadence-column kadence-column6409_c50279-92\"><div 
class=\"kt-inside-inner-col\">\n<p class=\"wp-block-paragraph\"><strong>Core idea<\/strong>: Combine the preceding primitives to compensate for individual weaknesses. Typical patterns include running direct WIWNU prediction and site-level prediction in parallel for cross-validation of error characteristics, or ensembling GP uncertainty with gradient-boosting accuracy. Another common pattern is applying different models to different layers of a hierarchical decomposition: polynomial fitting for IWS, gradient boosting for WTW, and GP for residual uncertainty. <strong>Most production-grade papers fall into this category<\/strong>; pure single-primitive papers are the minority. Mastery of individual primitives is a prerequisite for designing effective hybrid pipelines.<\/p>\n\n\n\n<p class=\"wp-block-paragraph\"><strong>Mathematical formulation<\/strong>:<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">General ensemble prediction:<\/p>\n\n\n\n$$\\hat{T}_{ensemble}(r, \\theta) = \\sum_{m=1}^{M} w_{m} \\cdot \\hat{T}_{m}(r, \\theta)$$\n\n\n\n<p class=\"wp-block-paragraph\">with weights learned by stacking or fixed by domain knowledge.<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">Hierarchical hybrid pipeline:<\/p>\n\n\n\n$$\\hat{T}(r, \\theta) = \\underbrace{f_{A}(\\mathbf{s})}_{wafer\\ mean} + \\underbrace{\\sum_{k=1}^{K} f_{B,k}(\\mathbf{s}) \\cdot \\psi_{k}(r, \\theta)}_{Zernike\\ reconstruction} + \\underbrace{f_{C}(r, \\theta, \\mathbf{s})}_{GP\\ residual}$$\n\n\n\n<p class=\"wp-block-paragraph\">Cross-validation between direct and derived WIWNU:<\/p>\n\n\n\n$$\\widehat{WIWNU}_{direct} = f_{direct}(\\mathbf{s})$$\n\n\n\n$$\\widehat{WIWNU}_{derived} = \\frac{\\text{std}(\\hat{T}_{1}, \\ldots, \\hat{T}_{N_{sites}})}{\\text{mean}(\\hat{T}_{1}, \\ldots, \\hat{T}_{N_{sites}})}$$\n\n\n\n<p class=\"wp-block-paragraph\">Large discrepancy between the two indicates prediction unreliability and triggers a real measurement fallback.<\/p>\n\n\n\n<p 
class=\"wp-block-paragraph\"><strong>Conceptual diagram<\/strong>:<\/p>\n\n\n\n<pre style=\"font-family: Consolas, Monaco, 'Courier New', monospace; background-color: #fff; padding: 16px; border-radius: 6px; overflow-x: auto; line-height: 1.4; font-size: 0.9em; white-space: pre; border: none;\">   Multiple primitives combined in pipeline\n\n   Sensors --+--&gt; [Input FE: tsfresh] --+\n             |                          |\n             |                          v\n             |                 +------------------+\n             |                 | Model A: GBM     |--&gt; wafer_mean\n             |                 | (scalar target)  |\n             |                 +------------------+\n             |\n             +--&gt; [Output FE: Zernike] --+\n                                          v\n                                 +------------------+\n                                 | Model B: MTGP    |--&gt; c1...c5\n                                 | (coefficients)   |    + uncertainty\n                                 +------------------+\n                                          |\n                                          v\n                              +---------------------+\n                              | Ensemble + physical |--&gt; 13 sites\n                              | reconstruction      |    + WIWNU\n                              +---------------------+\n                                          |\n                                          v\n                                  Cross-check with\n                                  Model C: direct WIWNU<\/pre>\n\n\n\n<p class=\"wp-block-paragraph\"><strong>Key concepts and references<\/strong>:<\/p>\n\n\n\n<ul class=\"wp-block-list\">\n<li>Direct vs derived WIWNU comparison \u2014 Rothe 2025<\/li>\n\n\n\n<li>GP + GBM stacking for CMP VM \u2014 Cai 2021<\/li>\n\n\n\n<li>Adaptive active learning with MTGP for CVD \u2014 Cai 2022<\/li>\n\n\n\n<li>Hierarchical decomposition with per-level models \u2014 
Dwivedi 2023<\/li>\n\n\n\n<li>Ensemble with Shapley interpretation \u2014 Liu 2025<\/li>\n<\/ul>\n<\/div><\/div>\n\n\n\n<h2 class=\"wp-block-heading\">Taxonomy Usage Guideline<\/h2>\n\n\n<style>.kadence-column6409_732add-c8 > .kt-inside-inner-col,.kadence-column6409_732add-c8 > .kt-inside-inner-col:before{border-top-left-radius:0px;border-top-right-radius:0px;border-bottom-right-radius:0px;border-bottom-left-radius:0px;}.kadence-column6409_732add-c8 > .kt-inside-inner-col{column-gap:var(--global-kb-gap-sm, 1rem);}.kadence-column6409_732add-c8 > .kt-inside-inner-col{flex-direction:column;}.kadence-column6409_732add-c8 > .kt-inside-inner-col > .aligncenter{width:100%;}.kadence-column6409_732add-c8 > .kt-inside-inner-col:before{opacity:0.3;}.kadence-column6409_732add-c8{position:relative;}.kadence-column6409_732add-c8, .kt-inside-inner-col > .kadence-column6409_732add-c8:not(.specificity){margin-left:var(--global-kb-spacing-md, 2rem);}@media all and (max-width: 1024px){.kadence-column6409_732add-c8 > .kt-inside-inner-col{flex-direction:column;justify-content:center;}}@media all and (max-width: 767px){.kadence-column6409_732add-c8 > .kt-inside-inner-col{flex-direction:column;justify-content:center;}}<\/style>\n<div class=\"wp-block-kadence-column kadence-column6409_732add-c8\"><div class=\"kt-inside-inner-col\">\n<p class=\"wp-block-paragraph\">Real-world papers combine two or three primitives rather than belonging cleanly to one category:<\/p>\n\n\n\n<ul class=\"wp-block-list\">\n<li>Zhang 2011 (Virtual Probe) = <strong>A<\/strong> (DCT basis) + <strong>D<\/strong> (sparse recovery)<\/li>\n\n\n\n<li>Cai 2020 (CMP MTGP) = <strong>B<\/strong> (MTGP) + <strong>C<\/strong> (reference-based hierarchy)<\/li>\n\n\n\n<li>Go 2025 (PEB PINN) = <strong>E<\/strong> (physics) + <strong>F<\/strong> (multi-output field)<\/li>\n\n\n\n<li>Rothe 2025 (CMP VM) = <strong>C<\/strong> (product factor hierarchy) + <strong>F<\/strong> (multi-site) + <strong>G<\/strong> (direct-vs-derived 
ensemble)<\/li>\n<\/ul>\n\n\n\n<p class=\"wp-block-paragraph\">When reviewing a paper, the productive question is: <strong>what is the primary primitive, and which secondary primitives are combined with it?<\/strong> This framing reveals each paper&#8217;s true contribution and makes cross-paper comparison meaningful.<\/p>\n<\/div><\/div>\n\n\n\n<h2 class=\"wp-block-heading\">Four-Axis Classification for Paper Review<\/h2>\n\n\n<style>.kadence-column6409_ae733d-ac > .kt-inside-inner-col,.kadence-column6409_ae733d-ac > .kt-inside-inner-col:before{border-top-left-radius:0px;border-top-right-radius:0px;border-bottom-right-radius:0px;border-bottom-left-radius:0px;}.kadence-column6409_ae733d-ac > .kt-inside-inner-col{column-gap:var(--global-kb-gap-sm, 1rem);}.kadence-column6409_ae733d-ac > .kt-inside-inner-col{flex-direction:column;}.kadence-column6409_ae733d-ac > .kt-inside-inner-col > .aligncenter{width:100%;}.kadence-column6409_ae733d-ac > .kt-inside-inner-col:before{opacity:0.3;}.kadence-column6409_ae733d-ac{position:relative;}.kadence-column6409_ae733d-ac, .kt-inside-inner-col > .kadence-column6409_ae733d-ac:not(.specificity){margin-left:var(--global-kb-spacing-md, 2rem);}@media all and (max-width: 1024px){.kadence-column6409_ae733d-ac > .kt-inside-inner-col{flex-direction:column;justify-content:center;}}@media all and (max-width: 767px){.kadence-column6409_ae733d-ac > .kt-inside-inner-col{flex-direction:column;justify-content:center;}}<\/style>\n<div class=\"wp-block-kadence-column kadence-column6409_ae733d-ac\"><div class=\"kt-inside-inner-col\">\n<p class=\"wp-block-paragraph\">Each paper can additionally be indexed along four orthogonal axes:<\/p>\n\n\n\n$$\\mathbf{x}_{paper} = [\\ \\text{InputFE},\\ \\text{OutputFE},\\ \\text{InductiveBias},\\ \\text{Model}\\ ]$$\n\n\n\n<ul class=\"wp-block-list\">\n<li><strong>Input FE<\/strong>: sensor-side feature engineering (tsfresh, statistical features, FFT, wavelets, PCA on sensors, step-wise segmentation, chamber 
context)<\/li>\n\n\n\n<li><strong>Output FE<\/strong>: target-side representation (raw sites, wafer-mean-residual, reference-site-residual, Zernike coefficients, PCA scores, zone aggregates, WIWNU scalar)<\/li>\n\n\n\n<li><strong>Inductive Bias<\/strong>: spatial structure encoding (location as categorical, polar coordinates $(r, \\theta)$, spatial kernel, graph structure, 2D wafer map, PDE constraint)<\/li>\n\n\n\n<li><strong>Model<\/strong>: predictor algorithm (Ridge, Lasso, PLS, SVR, GP, MTGP, Random Forest, XGBoost, LightGBM, CatBoost, MLP, 1D-CNN, Transformer, GNN, ensembles)<\/li>\n<\/ul>\n<\/div><\/div>\n\n\n\n<h2 class=\"wp-block-heading\">Summary<\/h2>\n\n\n<style>.kadence-column6409_6f9555-bb > .kt-inside-inner-col,.kadence-column6409_6f9555-bb > .kt-inside-inner-col:before{border-top-left-radius:0px;border-top-right-radius:0px;border-bottom-right-radius:0px;border-bottom-left-radius:0px;}.kadence-column6409_6f9555-bb > .kt-inside-inner-col{column-gap:var(--global-kb-gap-sm, 1rem);}.kadence-column6409_6f9555-bb > .kt-inside-inner-col{flex-direction:column;}.kadence-column6409_6f9555-bb > .kt-inside-inner-col > .aligncenter{width:100%;}.kadence-column6409_6f9555-bb > .kt-inside-inner-col:before{opacity:0.3;}.kadence-column6409_6f9555-bb{position:relative;}.kadence-column6409_6f9555-bb, .kt-inside-inner-col > .kadence-column6409_6f9555-bb:not(.specificity){margin-left:var(--global-kb-spacing-md, 2rem);}@media all and (max-width: 1024px){.kadence-column6409_6f9555-bb > .kt-inside-inner-col{flex-direction:column;justify-content:center;}}@media all and (max-width: 767px){.kadence-column6409_6f9555-bb > .kt-inside-inner-col{flex-direction:column;justify-content:center;}}<\/style>\n<div class=\"wp-block-kadence-column kadence-column6409_6f9555-bb\"><div class=\"kt-inside-inner-col\">\n<p class=\"wp-block-paragraph\">The seven categories (A\u2013G) form a complete primitive set for WIW-ML research. 
Categories A\u2013F represent distinct methodological innovations; category G represents the combinatorial space in which most production-grade systems live. The four-axis classification disambiguates where a paper&#8217;s novelty actually resides, which is frequently on the Output-FE or Inductive-Bias axes rather than on the Model axis. For researchers entering this field, reading six foundational papers \u2014 Zhang 2011, Bonilla 2008, Cai 2020, Dwivedi 2023, Rothe 2025, Shintani 2021 \u2014 provides full coverage of the landscape.<\/p>\n<\/div><\/div>\n\n\n\n<h2 class=\"wp-block-heading\">References<\/h2>\n\n\n\n<ul class=\"wp-block-list\">\n<li>Ahmadi 2015 \u2014 Ahmadi, A., et al., &#8220;Joint exploration of multiple test items&#8217; spatial patterns via compressed sensing,&#8221; IEEE Transactions on Semiconductor Manufacturing, 2015.<\/li>\n\n\n\n<li>Bonilla 2008 \u2014 Bonilla, E. V., Chai, K. M. A., and Williams, C. K. I., &#8220;Multi-task Gaussian process prediction,&#8221; Advances in Neural Information Processing Systems 20, 2008.<\/li>\n\n\n\n<li>Cai 2020 \u2014 Cai, H., Feng, J., Yang, Q., Li, W., Li, X., and Lee, J., &#8220;A virtual metrology method with prediction uncertainty based on Gaussian process for chemical mechanical planarization,&#8221; Computers in Industry, 2020.<\/li>\n\n\n\n<li>Cai 2021 \u2014 Cai, H., et al., &#8220;Reference-based virtual metrology method with uncertainty evaluation for material removal rate prediction based on Gaussian process regression,&#8221; 2021.<\/li>\n\n\n\n<li>Cai 2022 \u2014 Cai, H., et al., &#8220;An improved virtual metrology method in chemical vapor deposition systems via multi-task Gaussian processes and adaptive active learning,&#8221; International Journal of Advanced Manufacturing Technology, 2022.<\/li>\n\n\n\n<li>Cressie 1993 \u2014 Cressie, N. A. C., &#8220;Statistics for Spatial Data,&#8221; Wiley, 1993.<\/li>\n\n\n\n<li>Dorogush 2018 \u2014 Dorogush, A. 
V., Ershov, V., and Gulin, A., &#8220;CatBoost: gradient boosting with categorical features support,&#8221; arXiv:1810.11363, 2018.<\/li>\n\n\n\n<li>Dwivedi 2023 \u2014 Dwivedi, S., et al., &#8220;Capturing the effects of spatial process variations in silicon photonic circuits,&#8221; ACS Photonics, 2023.<\/li>\n\n\n\n<li>Go 2025 \u2014 Go, J., et al., &#8220;Real-time monitoring of thermoelastic deformation of a silicon wafer with sparse measurements in the photolithography process using a physics-informed neural network and Fourier neural operator,&#8221; Engineering Applications of Artificial Intelligence, 2025.<\/li>\n\n\n\n<li>Han 2025 \u2014 Han, T., et al., &#8220;Physics-Informed Neural Networks for Semiconductor Film Deposition: A Review,&#8221; arXiv:2507.10983, 2025.<\/li>\n\n\n\n<li>He 2018 \u2014 He, J., and Zhu, Y., &#8220;Hierarchical multi-task learning with application to wafer quality prediction,&#8221; 2018.<\/li>\n\n\n\n<li>Kazemi 2020 \u2014 Kazemi, P., et al., &#8220;Adaptive neural-based PCA framework for fault detection and diagnosis in time-varying industrial processes,&#8221; 2020.<\/li>\n\n\n\n<li>Kendall 2018 \u2014 Kendall, A., Gal, Y., and Cipolla, R., &#8220;Multi-task learning using uncertainty to weigh losses for scene geometry and semantics,&#8221; CVPR 2018.<\/li>\n\n\n\n<li>Kim 2025 \u2014 Kim, S., et al., &#8220;A neural master equation framework for multiscale modeling of molecular processes: application to atomic-scale plasma processes,&#8221; npj Computational Materials, 2025.<\/li>\n\n\n\n<li>Li 2022 \u2014 Li, X., et al., &#8220;CFD and ANN hybrid modeling for ALD SiO2 deposition,&#8221; 2022.<\/li>\n\n\n\n<li>Liu 2022 \u2014 Liu, Y., et al., &#8220;Mixed-effect profile monitoring for wafer thickness in industrial wafer slicing,&#8221; 2022.<\/li>\n\n\n\n<li>Liu 2025 \u2014 Liu, Y.-Y., Wang, Y.-C., Hsu, W.-C., Lin, C.-H., and Chang, K.-H., &#8220;An empirical study on enhancing wafer quality: Integrating big data and AI in 
virtual metrology for thin-film processing,&#8221; ScienceDirect, 2025.<\/li>\n\n\n\n<li>Noh 2018 \u2014 Noh, H., et al., &#8220;Zernike polynomial modeling for wafer-level overlay correction in APC,&#8221; 2018.<\/li>\n\n\n\n<li>Park 2018 \u2014 Park, C., et al., &#8220;Multitask learning for virtual metrology in semiconductor manufacturing systems,&#8221; Computers &amp; Industrial Engineering, 2018.<\/li>\n\n\n\n<li>Reda 2010 \u2014 Reda, S., and Nassif, S. R., &#8220;Accurate spatial estimation and decomposition techniques for variability characterization,&#8221; IEEE Transactions on Semiconductor Manufacturing, vol. 23, no. 3, pp. 345\u2013357, 2010.<\/li>\n\n\n\n<li>Rothe 2025 \u2014 Rothe, T., Lauff, A., Thieme, P., Langer, J., G\u00fcnther, M., and Kuhn, H., &#8220;Process data-driven machine learning for non-uniformity prediction and virtual metrology in chemical mechanical planarization,&#8221; Journal of Intelligent Manufacturing, 2025.<\/li>\n\n\n\n<li>Schirru 2011 \u2014 Schirru, A., Pampuri, S., and De Nicolao, G., &#8220;Multilevel kernel methods for virtual metrology in semiconductor manufacturing,&#8221; IFAC Proceedings, 2011.<\/li>\n\n\n\n<li>Shintani 2021 \u2014 Shintani, M., Mian, R.-U.-H., Inoue, M., Nakamura, T., Kajiyama, M., and Eiki, M., &#8220;Wafer-level variation modeling for multi-site RF IC testing via hierarchical Gaussian process,&#8221; arXiv:2111.01369, 2021.<\/li>\n\n\n\n<li>Zhang 2010 \u2014 Zhang, W., Li, X., and Rutenbar, R. A., &#8220;Bayesian virtual probe: Minimizing variation characterization cost for nanoscale IC technologies via Bayesian inference,&#8221; DAC 2010.<\/li>\n\n\n\n<li>Zhang 2011 \u2014 Zhang, W., Li, X., Liu, F., Acar, E., Rutenbar, R. A., and Blanton, R. D., &#8220;Virtual probe: A statistical framework for low-cost silicon characterization of nanoscale integrated circuits,&#8221; IEEE Transactions on Computer-Aided Design of Integrated Circuits and Systems, vol. 30, no. 12, pp. 
1814\u20131827, 2011.<\/li>\n\n\n\n<li>Zhang 2012 \u2014 Zhang, W., et al., &#8220;Multi-Wafer Virtual Probe: Minimum-cost variation characterization by exploring wafer-to-wafer correlation,&#8221; 2012.<\/li>\n\n\n\n<li>Zhang 2014 \u2014 Zhang, W., et al., &#8220;Joint Virtual Probe: Joint exploration of multiple test items&#8217; spatial patterns for efficient silicon characterization,&#8221; 2014.<\/li>\n\n\n\n<li>Zhang 2024 \u2014 Zhang, Y., et al., &#8220;Ant Colony Optimization and Back Propagation Neural Network for 4H-SiC CVD epitaxy uniformity optimization,&#8221; 2024.<\/li>\n<\/ul>\n<div style='text-align:center' class='yasr-auto-insert-overall'><\/div><div style='text-align:center' class='yasr-auto-insert-visitor'><\/div>","protected":false},"excerpt":{"rendered":"<p>the far side of the Moon This report presents a structured taxonomy of machine learning methodologies specialized for Within-Wafer (WIW) variation prediction in semiconductor manufacturing. General-purpose ML approaches often fail to exploit the unique spatial structure of wafer data: circular geometry, process-induced radial symmetry, and strong inter-site correlations. 
The seven categories below organize WIW-specific methods&#8230;<\/p>\n","protected":false},"author":4,"featured_media":6399,"comment_status":"open","ping_status":"open","sticky":false,"template":"","format":"standard","meta":{"_bbp_topic_count":0,"_bbp_reply_count":0,"_bbp_total_topic_count":0,"_bbp_total_reply_count":0,"_bbp_voice_count":0,"_bbp_anonymous_reply_count":0,"_bbp_topic_count_hidden":0,"_bbp_reply_count_hidden":0,"_bbp_forum_subforum_count":0,"_kadence_starter_templates_imported_post":false,"_kad_post_transparent":"","_kad_post_title":"","_kad_post_layout":"","_kad_post_sidebar_id":"","_kad_post_content_style":"","_kad_post_vertical_padding":"","_kad_post_feature":"","_kad_post_feature_position":"","_kad_post_header":false,"_kad_post_footer":false,"_kad_post_classname":"","yasr_overall_rating":0,"yasr_post_is_review":"","yasr_auto_insert_disabled":"","yasr_review_type":"","fifu_image_url":"","fifu_image_alt":"","iawp_total_views":0,"footnotes":""},"categories":[56,18,4],"tags":[],"class_list":["post-6409","post","type-post","status-publish","format-standard","has-post-thumbnail","hentry","category-data-science-slug","category-ai-powered-slug","category-semiconductor-slug"],"yasr_visitor_votes":{"stars_attributes":{"read_only":false,"span_bottom":false},"number_of_votes":1,"sum_votes":4},"jetpack_featured_media_url":"https:\/\/ykim.synology.me\/wordpress\/wp-content\/uploads\/2026\/04\/Moon_Farside_LRO1024px.jpg","_links":{"self":[{"href":"https:\/\/ykim.synology.me\/wordpress\/wp-json\/wp\/v2\/posts\/6409","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/ykim.synology.me\/wordpress\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/ykim.synology.me\/wordpress\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/ykim.synology.me\/wordpress\/wp-json\/wp\/v2\/users\/4"}],"replies":[{"embeddable":true,"href":"https:\/\/ykim.synology.me\/wordpress\/wp-json\/wp\/v2\/comments?post=6409"}],"version-history":[{"co
unt":10,"href":"https:\/\/ykim.synology.me\/wordpress\/wp-json\/wp\/v2\/posts\/6409\/revisions"}],"predecessor-version":[{"id":6437,"href":"https:\/\/ykim.synology.me\/wordpress\/wp-json\/wp\/v2\/posts\/6409\/revisions\/6437"}],"wp:featuredmedia":[{"embeddable":true,"href":"https:\/\/ykim.synology.me\/wordpress\/wp-json\/wp\/v2\/media\/6399"}],"wp:attachment":[{"href":"https:\/\/ykim.synology.me\/wordpress\/wp-json\/wp\/v2\/media?parent=6409"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/ykim.synology.me\/wordpress\/wp-json\/wp\/v2\/categories?post=6409"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/ykim.synology.me\/wordpress\/wp-json\/wp\/v2\/tags?post=6409"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}