001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * https://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017 018package org.apache.commons.statistics.descriptive; 019 020import java.util.Objects; 021import org.apache.commons.numbers.arrays.Selection; 022 023/** 024 * Returns the median of the available values. 025 * 026 * <p>For values of length {@code n}, let {@code k = n / 2}: 027 * <ul> 028 * <li>The result is {@code NaN} if {@code n = 0}.</li> 029 * <li>The result is {@code values[k]} if {@code n} is odd.</li> 030 * <li>The result is {@code (values[k - 1] + values[k]) / 2} if {@code n} is even.</li> 031 * </ul> 032 * 033 * <p>This implementation respects the ordering imposed by 034 * {@link Double#compare(double, double)} for {@code NaN} values. If a {@code NaN} occurs 035 * in the selected positions in the fully sorted values then the result is {@code NaN}. 036 * 037 * <p>The {@link NaNPolicy} can be used to change the behaviour on {@code NaN} values. 038 * 039 * <p>Instances of this class are immutable and thread-safe. 040 * 041 * <p><strong>Support for {@code long} arrays</strong> 042 * 043 * <p>The result on {@code long} values can be returned as a {@code double} or 044 * a {@code long} using a {@link StatisticResult}. 045 * 046 * <p>The {@code double} result is the closest representable value 047 * following floating-point rounding rules. This may be outside the 048 * range defined by the minimum and maximum of the input array following 049 * rounding to a 53-bit floating point representation. 050 * For example the median of an array containing only {@link Long#MAX_VALUE} 051 * as a {@code double} is 2<sup>63</sup>, which is the closest representation of 052 * 2<sup>63</sup> - 1. 053 * 054 * <p>The {@code long} result is returned using the nearest whole number. 055 * In the event of ties the result is rounded towards positive infinity. 056 * For example the median of {@code [2, 3]} is 3, and of {@code [-2, -3]} is -2. 057 * This value will always be within the range defined by the minimum and maximum 058 * of the input array. Due to interpolation it may be a value not observed in 059 * the input values. 060 * 061 * <p>If the array length {@code n} is zero the result as a {@code double} is 062 * {@code NaN} and the result as a {@code long} will raise an {@link ArithmeticException}. 063 * 064 * @see #with(NaNPolicy) 065 * @see <a href="https://en.wikipedia.org/wiki/Median">Median (Wikipedia)</a> 066 * @since 1.1 067 */ 068public final class Median { 069 /** Default instance. */ 070 private static final Median DEFAULT = new Median(false, NaNPolicy.INCLUDE); 071 072 /** Flag to indicate if the data should be copied. */ 073 private final boolean copy; 074 /** NaN policy for floating point data. */ 075 private final NaNPolicy nanPolicy; 076 /** Transformer for NaN data. */ 077 private final NaNTransformer nanTransformer; 078 079 /** 080 * @param copy Flag to indicate if the data should be copied. 081 * @param nanPolicy NaN policy. 082 */ 083 private Median(boolean copy, NaNPolicy nanPolicy) { 084 this.copy = copy; 085 this.nanPolicy = nanPolicy; 086 nanTransformer = NaNTransformers.createNaNTransformer(nanPolicy, copy); 087 } 088 089 /** 090 * Return a new instance with the default options. 091 * 092 * <ul> 093 * <li>{@linkplain #withCopy(boolean) Copy = false}</li> 094 * <li>{@linkplain #with(NaNPolicy) NaN policy = include}</li> 095 * </ul> 096 * 097 * <p>Note: The default options configure for processing in-place and including 098 * {@code NaN} values in the data. This is the most efficient mode and has the 099 * smallest memory consumption. 100 * 101 * @return the median implementation 102 * @see #withCopy(boolean) 103 * @see #with(NaNPolicy) 104 */ 105 public static Median withDefaults() { 106 return DEFAULT; 107 } 108 109 /** 110 * Return an instance with the configured copy behaviour. If {@code false} then 111 * the input array will be modified by the call to evaluate the median; otherwise 112 * the computation uses a copy of the data. 113 * 114 * @param v Value. 115 * @return an instance 116 */ 117 public Median withCopy(boolean v) { 118 return new Median(v, nanPolicy); 119 } 120 121 /** 122 * Return an instance with the configured {@link NaNPolicy}. 123 * 124 * <p>Note: This implementation respects the ordering imposed by 125 * {@link Double#compare(double, double)} for {@code NaN} values: {@code NaN} is 126 * considered greater than all other values, and all {@code NaN} values are equal. The 127 * {@link NaNPolicy} changes the computation of the statistic in the presence of 128 * {@code NaN} values. 129 * 130 * <ul> 131 * <li>{@link NaNPolicy#INCLUDE}: {@code NaN} values are moved to the end of the data; 132 * the size of the data <em>includes</em> the {@code NaN} values and the median will be 133 * {@code NaN} if any value used for median interpolation is {@code NaN}.</li> 134 * <li>{@link NaNPolicy#EXCLUDE}: {@code NaN} values are moved to the end of the data; 135 * the size of the data <em>excludes</em> the {@code NaN} values and the median will 136 * never be {@code NaN} for non-zero size. If all data are {@code NaN} then the size is zero 137 * and the result is {@code NaN}.</li> 138 * <li>{@link NaNPolicy#ERROR}: An exception is raised if the data contains {@code NaN} 139 * values.</li> 140 * </ul> 141 * 142 * <p>Note that the result is identical for all policies if no {@code NaN} values are present. 143 * 144 * @param v Value. 145 * @return an instance 146 */ 147 public Median with(NaNPolicy v) { 148 return new Median(copy, Objects.requireNonNull(v)); 149 } 150 151 /** 152 * Evaluate the median. 153 * 154 * <p>Note: This method may partially sort the input values if not configured to 155 * {@link #withCopy(boolean) copy} the input data. 156 * 157 * @param values Values. 158 * @return the median 159 * @throws IllegalArgumentException if the values contain NaN and the configuration is {@link NaNPolicy#ERROR} 160 * @see #with(NaNPolicy) 161 */ 162 public double evaluate(double[] values) { 163 return compute(values, 0, values.length); 164 } 165 166 /** 167 * Evaluate the median of the specified range. 168 * 169 * <p>Note: This method may partially sort the input values if not configured to 170 * {@link #withCopy(boolean) copy} the input data. 171 * 172 * @param values Values. 173 * @param from Inclusive start of the range. 174 * @param to Exclusive end of the range. 175 * @return the median 176 * @throws IllegalArgumentException if the values contain NaN and the configuration is {@link NaNPolicy#ERROR} 177 * @throws IndexOutOfBoundsException if the sub-range is out of bounds 178 * @see #with(NaNPolicy) 179 * @since 1.2 180 */ 181 public double evaluateRange(double[] values, int from, int to) { 182 Statistics.checkFromToIndex(from, to, values.length); 183 return compute(values, from, to); 184 } 185 186 /** 187 * Compute the median of the specified range. 188 * 189 * @param values Values. 190 * @param from Inclusive start of the range. 191 * @param to Exclusive end of the range. 192 * @return the median 193 */ 194 private double compute(double[] values, int from, int to) { 195 // Floating-point data handling 196 final int[] bounds = new int[2]; 197 final double[] x = nanTransformer.apply(values, from, to, bounds); 198 final int start = bounds[0]; 199 final int end = bounds[1]; 200 final int n = end - start; 201 // Special cases 202 if (n <= 2) { 203 switch (n) { 204 case 2: 205 // Sorting the array matches the behaviour of Quantile for n==2 206 // Handle NaN and signed zeros 207 if (Double.compare(x[start + 1], x[start]) < 0) { 208 final double t = x[start]; 209 x[start] = x[start + 1]; 210 x[start + 1] = t; 211 } 212 return Interpolation.mean(x[start], x[start + 1]); 213 case 1: 214 return x[start]; 215 default: 216 return Double.NaN; 217 } 218 } 219 // Median index (including the offset) 220 final int m = (start + end) >>> 1; 221 // Odd 222 if ((n & 0x1) == 1) { 223 Selection.select(x, start, end, m); 224 return x[m]; 225 } 226 // Even: require (m-1, m) 227 Selection.select(x, start, end, new int[] {m - 1, m}); 228 return Interpolation.mean(x[m - 1], x[m]); 229 } 230 231 /** 232 * Evaluate the median. 233 * 234 * <p>Note: This method may partially sort the input values if not configured to 235 * {@link #withCopy(boolean) copy} the input data. 236 * 237 * @param values Values. 238 * @return the median 239 */ 240 public double evaluate(int[] values) { 241 return compute(values, 0, values.length); 242 } 243 244 /** 245 * Evaluate the median of the specified range. 246 * 247 * <p>Note: This method may partially sort the input values if not configured to 248 * {@link #withCopy(boolean) copy} the input data. 249 * 250 * @param values Values. 251 * @param from Inclusive start of the range. 252 * @param to Exclusive end of the range. 253 * @return the median 254 * @throws IndexOutOfBoundsException if the sub-range is out of bounds 255 * @since 1.2 256 */ 257 public double evaluateRange(int[] values, int from, int to) { 258 Statistics.checkFromToIndex(from, to, values.length); 259 return compute(values, from, to); 260 } 261 262 /** 263 * Compute the median of the specified range. 264 * 265 * @param values Values. 266 * @param from Inclusive start of the range. 267 * @param to Exclusive end of the range. 268 * @return the median 269 */ 270 private double compute(int[] values, int from, int to) { 271 final int[] x; 272 final int start; 273 final int end; 274 if (copy) { 275 x = Statistics.copy(values, from, to); 276 start = 0; 277 end = x.length; 278 } else { 279 x = values; 280 start = from; 281 end = to; 282 } 283 final int n = end - start; 284 // Special cases 285 if (n <= 2) { 286 switch (n) { 287 case 2: 288 // Sorting the array matches the behaviour of Quantile for n==2 289 if (x[start + 1] < x[start]) { 290 final int t = x[start]; 291 x[start] = x[start + 1]; 292 x[start + 1] = t; 293 } 294 return Interpolation.mean(x[start], x[start + 1]); 295 case 1: 296 return x[start]; 297 default: 298 return Double.NaN; 299 } 300 } 301 // Median index (including the offset) 302 final int m = (start + end) >>> 1; 303 // Odd 304 if ((n & 0x1) == 1) { 305 Selection.select(x, start, end, m); 306 return x[m]; 307 } 308 // Even: require (m-1, m) 309 Selection.select(x, start, end, new int[] {m - 1, m}); 310 return Interpolation.mean(x[m - 1], x[m]); 311 } 312 313 314 /** 315 * Evaluate the median. 316 * 317 * <p>If the input length is even the result requires interpolation of two values. 318 * The returned median will interpolate the {@code double} or {@code long} result 319 * on demand. This is more efficient when only one result is required. Consumers 320 * of the result should store the appropriate evaluated value if repeated use is 321 * required. 322 * 323 * <p>Note: This method may partially sort the input values if not configured to 324 * {@link #withCopy(boolean) copy} the input data. 325 * 326 * @param values Values. 327 * @return the median 328 * @since 1.3 329 */ 330 public StatisticResult evaluate(long[] values) { 331 return compute(values, 0, values.length); 332 } 333 334 /** 335 * Evaluate the median of the specified range. 336 * 337 * <p>If the input sub-range length is even the result requires interpolation of two values. 338 * The returned median will interpolate the {@code double} or {@code long} result 339 * on demand. This is more efficient when only one result is required. Consumers 340 * of the result should store the appropriate evaluated value if repeated use is 341 * required. 342 * 343 * <p>Note: This method may partially sort the input values if not configured to 344 * {@link #withCopy(boolean) copy} the input data. 345 * 346 * @param values Values. 347 * @param from Inclusive start of the range. 348 * @param to Exclusive end of the range. 349 * @return the median 350 * @throws IndexOutOfBoundsException if the sub-range is out of bounds 351 * @since 1.3 352 */ 353 public StatisticResult evaluateRange(long[] values, int from, int to) { 354 Statistics.checkFromToIndex(from, to, values.length); 355 return compute(values, from, to); 356 } 357 358 /** 359 * Compute the median of the specified range. 360 * 361 * @param values Values. 362 * @param from Inclusive start of the range. 363 * @param to Exclusive end of the range. 364 * @return the median 365 */ 366 private StatisticResult compute(long[] values, int from, int to) { 367 final long[] x; 368 final int start; 369 final int end; 370 if (copy) { 371 x = Statistics.copy(values, from, to); 372 start = 0; 373 end = x.length; 374 } else { 375 x = values; 376 start = from; 377 end = to; 378 } 379 final int n = end - start; 380 // Special cases 381 if (n <= 2) { 382 switch (n) { 383 case 2: 384 // Sorting the array matches the behaviour of Quantile for n==2 385 if (x[start + 1] < x[start]) { 386 final long t = x[start]; 387 x[start] = x[start + 1]; 388 x[start + 1] = t; 389 } 390 return Interpolation.mean(x[start], x[start + 1]); 391 case 1: 392 return Statistics.createStatisticResult(x[start]); 393 default: 394 return () -> Double.NaN; 395 } 396 } 397 // Median index (including the offset) 398 final int m = (start + end) >>> 1; 399 // Odd 400 if ((n & 0x1) == 1) { 401 Selection.select(x, start, end, m); 402 return Statistics.createStatisticResult(x[m]); 403 } 404 // Even: require (m-1, m) 405 Selection.select(x, start, end, new int[] {m - 1, m}); 406 return Interpolation.mean(x[m - 1], x[m]); 407 } 408}