1 /***********************************************************************
2  * Copyright (c) 2013-2024 Commonwealth Computer Research, Inc.
3  * All rights reserved. This program and the accompanying materials
4  * are made available under the terms of the Apache License, Version 2.0
5  * which accompanies this distribution and is available at
6  * http://www.opensource.org/licenses/apache2.0.php.
7  ***********************************************************************/
8 
9 package org.locationtech.geomesa.index.stats
10 
11 import org.geotools.api.feature.`type`.AttributeDescriptor
12 import org.geotools.api.feature.simple.{SimpleFeature, SimpleFeatureType}
13 import org.geotools.api.filter.Filter
14 import org.geotools.geometry.jts.ReferencedEnvelope
15 import org.geotools.util.factory.Hints
16 import org.locationtech.geomesa.curve.TimePeriod.TimePeriod
17 import org.locationtech.geomesa.filter.visitor.BoundsFilterVisitor
18 import org.locationtech.geomesa.index.stats.GeoMesaStats.GeoMesaStatWriter
19 import org.locationtech.geomesa.utils.geotools._
20 import org.locationtech.geomesa.utils.stats._
21 import org.locationtech.jts.geom.Geometry
22 
23 import java.io.{Closeable, Flushable}
24 import java.util.Date
25 
/**
 * Read-side API for the stats tracked for a schema - spatial/temporal bounds, number of records,
 * value distributions, etc. Persistence of stats is not part of this trait, as different
 * implementations will likely have different method signatures; updates go through [[writer]].
 */
trait GeoMesaStats extends Closeable {

  import org.locationtech.geomesa.utils.geotools.RichSimpleFeatureType.RichSimpleFeatureType

  /**
    * Gets a writer for updating stats
    *
    * @return stat writer
    */
  def writer: GeoMesaStatWriter

  /**
    * Gets the number of features that will be returned for a query. If exact is false and an
    * estimate is unavailable, no usable count is returned.
    *
    * NOTE(review): earlier documentation mentioned a -1 sentinel for an unavailable estimate, while
    * the return type is an Option - confirm whether any implementation still returns Some(-1).
    *
    * @param sft simple feature type
    * @param filter cql filter
    * @param exact rough estimate, or precise count. note: precise count will likely be expensive.
    * @param queryHints query hints that should be used for query execution
    * @return count of features, if available - will always be Some if exact == true
    */
  def getCount(
      sft: SimpleFeatureType,
      filter: Filter = Filter.INCLUDE,
      exact: Boolean = false,
      queryHints: Hints = new Hints()): Option[Long]

  /**
    * Get the bounds for data that will be returned for a query.
    *
    * The envelope derived from the filter is intersected with the min/max stat of the default
    * geometry attribute. If the schema has no geometry attribute, or no min/max is available,
    * the filter-derived envelope is returned as-is.
    *
    * @param sft simple feature type
    * @param filter cql filter
    * @param exact rough estimate, or precise bounds. note: precise bounds will likely be expensive.
    * @return bounds
    */
  def getBounds(
      sft: SimpleFeatureType,
      filter: Filter = Filter.INCLUDE,
      exact: Boolean = false): ReferencedEnvelope = {
    val filterBounds = BoundsFilterVisitor.visit(filter)
    // getGeomField may be null, hence the Option wrapper
    val geomBounds = Option(sft.getGeomField).flatMap(getMinMax[Geometry](sft, _, filter, exact))
    geomBounds match {
      case None => filterBounds
      case Some(bounds) =>
        // merge the envelopes of the min and max geometries, then clip to the filter envelope
        val env = bounds.min.getEnvelopeInternal
        env.expandToInclude(bounds.max.getEnvelopeInternal)
        filterBounds.intersection(env)
    }
  }

  /**
    * Get the minimum and maximum values for the given attribute
    *
    * @param sft simple feature type
    * @param attribute attribute name to examine
    * @param filter cql filter
    * @param exact rough estimate, or precise values. note: precise values will likely be expensive.
    * @tparam T attribute type - must correspond to attribute binding
    * @return min/max values and overall cardinality. types will be consistent with the binding of the attribute
    */
  def getMinMax[T](
      sft: SimpleFeatureType,
      attribute: String,
      filter: Filter = Filter.INCLUDE,
      exact: Boolean = false): Option[MinMax[T]]

  /**
    * Get an enumeration stat
    *
    * @param sft simple feature type
    * @param attribute attribute name to query
    * @param filter cql filter
    * @param exact rough estimates, or precise values. note: precise values will likely be expensive.
    * @tparam T attribute type - must correspond to attribute binding
    * @return enumeration stat, if available
    */
  def getEnumeration[T](
      sft: SimpleFeatureType,
      attribute: String,
      filter: Filter = Filter.INCLUDE,
      exact: Boolean = false): Option[EnumerationStat[T]]

  /**
    * Get a frequency stat
    *
    * @param sft simple feature type
    * @param attribute attribute name to query
    * @param precision precision of the estimate - @see org.locationtech.geomesa.utils.stats.Frequency
    * @param filter cql filter
    * @param exact rough estimates, or precise values. note: precise values will likely be expensive.
    * @tparam T attribute type - must correspond to attribute binding
    * @return frequency stat, if available
    */
  def getFrequency[T](
      sft: SimpleFeatureType,
      attribute: String,
      precision: Int,
      filter: Filter = Filter.INCLUDE,
      exact: Boolean = false): Option[Frequency[T]]

  /**
    * Get a top k stat
    *
    * @param sft simple feature type
    * @param attribute attribute name to query
    * @param filter cql filter
    * @param exact rough estimates, or precise values. note: precise values will likely be expensive.
    * @tparam T attribute type - must correspond to attribute binding
    * @return top k stat, if available
    */
  def getTopK[T](
      sft: SimpleFeatureType,
      attribute: String,
      filter: Filter = Filter.INCLUDE,
      exact: Boolean = false): Option[TopK[T]]

  /**
    * Get a histogram stat
    *
    * @param sft simple feature type
    * @param attribute attribute name to query
    * @param bins number of buckets used to group values
    * @param min minimum value used to create the initial histogram buckets
    * @param max maximum value used to create the initial histogram buckets
    * @param filter cql filter
    * @param exact rough estimates, or precise values. note: precise values will likely be expensive.
    * @tparam T attribute type - must correspond to attribute binding
    * @return histogram stat, if available
    */
  def getHistogram[T](
      sft: SimpleFeatureType,
      attribute: String,
      bins: Int,
      min: T,
      max: T,
      filter: Filter = Filter.INCLUDE,
      exact: Boolean = false): Option[Histogram[T]]

  /**
    * Get a Z3 histogram stat, where values are grouped based on combined geometry + date
    *
    * @param sft simple feature type
    * @param geom geometry attribute to query
    * @param dtg date attribute to query
    * @param period time period used to calculate bins for each value
    * @param bins number of buckets used to group values
    * @param filter cql filter
    * @param exact rough estimates, or precise values. note: precise values will likely be expensive.
    * @return z3 histogram stat, if available
    */
  def getZ3Histogram(
      sft: SimpleFeatureType,
      geom: String,
      dtg: String,
      period: TimePeriod,
      bins: Int,
      filter: Filter = Filter.INCLUDE,
      exact: Boolean = false): Option[Z3Histogram]

  /**
    * Gets arbitrary stats for multiple queries.
    *
    * No queries short-circuits to an empty result, a single query is delegated directly to
    * `getStat`, and multiple queries are combined into one sequence-stat query whose component
    * stats are returned.
    *
    * @param sft simple feature type
    * @param queries stats strings
    * @param filter cql filter
    * @param exact rough estimate, or precise values. note: precise values will likely be expensive.
    * @tparam T type bounds, must match stat query strings
    * @return the resulting stats, or an empty seq if none are available
    */
  def getSeqStat[T <: Stat](
      sft: SimpleFeatureType,
      queries: Seq[String],
      filter: Filter = Filter.INCLUDE,
      exact: Boolean = false): Seq[T] = {
    if (queries.isEmpty) {
      Seq.empty
    } else if (queries.lengthCompare(1) == 0) {
      // the type argument is passed explicitly - `.toSeq` gives the compiler no expected type
      // to work from, so left to inference this call would be typed as getStat[Nothing]
      getStat[T](sft, queries.head, filter, exact).toSeq
    } else {
      getStat[SeqStat](sft, Stat.SeqStat(queries), filter, exact) match {
        case None    => Seq.empty
        // unchecked due to erasure - callers must pick a T that matches the query strings
        case Some(s) => s.stats.asInstanceOf[Seq[T]]
      }
    }
  }

  /**
    * Get arbitrary stats
    *
    * @param sft simple feature type
    * @param query stats string
    * @param filter cql filter
    * @param exact rough estimate, or precise values. note: precise values will likely be expensive.
    * @tparam T type bounds, must match stat query strings
    * @return stats, if any
    */
  def getStat[T <: Stat](
      sft: SimpleFeatureType,
      query: String,
      filter: Filter = Filter.INCLUDE,
      exact: Boolean = false): Option[T]
}
227 
object GeoMesaStats {

  import org.locationtech.geomesa.utils.geotools.RichAttributeDescriptors.RichAttributeDescriptor

  // bucket width for the date frequency, in milliseconds - one day
  val DateFrequencyPrecision: Int = 24 * 60 * 60 * 1000

  // number of buckets each attribute is sorted into
  // max space on disk = 8 bytes * size - serialization is optimized, so likely 1-3 bytes * size
  // buckets holding up to ~2M values will take 3 bytes or less
  val MaxHistogramSize: Int = 10000 // with ~1B records: ~100k records per bin and ~29 kb on disk
  val DefaultHistogramSize: Int = 1000

  // attribute bindings that are checked by okForStats
  val StatClasses: Seq[Class[_ <: AnyRef]] = Seq(
    classOf[Geometry],
    classOf[String],
    classOf[Integer],
    classOf[java.lang.Long],
    classOf[java.lang.Float],
    classOf[java.lang.Double],
    classOf[Date]
  )

  /**
    * Get the default bounds for a range histogram, produced by buffering a per-type seed value
    *
    * @param binding class type
    * @tparam T class type
    * @return bounds
    * @throws NotImplementedError if the binding is not one of the handled types
    */
  def defaultBounds[T](binding: Class[T]): (T, T) = {
    // typed as Any so that each numeric literal is boxed as its own type instead of being widened
    val seed: Any =
      if (binding == classOf[String]) {
        ""
      } else if (binding == classOf[Integer]) {
        0
      } else if (binding == classOf[java.lang.Long]) {
        0L
      } else if (binding == classOf[java.lang.Float]) {
        0f
      } else if (binding == classOf[java.lang.Double]) {
        0d
      } else if (classOf[Date].isAssignableFrom(binding)) {
        new Date()
      } else if (classOf[Geometry].isAssignableFrom(binding)) {
        GeometryUtils.zeroPoint
      } else {
        throw new NotImplementedError(s"Can't handle binding of type $binding")
      }
    Histogram.buffer(seed.asInstanceOf[T])
  }

  /**
    * Gets the default precision for a frequency stat
    *
    * @param binding class type
    * @return precision
    * @throws NotImplementedError if the binding is not one of the handled types
    */
  def defaultPrecision(binding: Class[_]): Int = {
    if (binding == classOf[String]) {
      20 // number of characters we will compare
    } else if (binding == classOf[Integer] || binding == classOf[java.lang.Long]) {
      1 // size of a 'bin'
    } else if (binding == classOf[java.lang.Float] || binding == classOf[java.lang.Double]) {
      1000 // 10 ^ decimal places we'll keep
    } else if (classOf[Date].isAssignableFrom(binding)) {
      1000 * 60 * 60 // size of a 'bin' - one hour
    } else {
      throw new NotImplementedError(s"Can't handle binding of type $binding")
    }
  }

  // determines if it is possible to run a min/max and histogram on the attribute:
  // it must be single-valued, with a binding assignable to one of the StatClasses
  // TODO GEOMESA-1217 support list/maps in stats
  def okForStats(d: AttributeDescriptor): Boolean = {
    if (d.isMultiValued) {
      false
    } else {
      val binding = d.getType.getBinding
      StatClasses.exists(clas => clas.isAssignableFrom(binding))
    }
  }

  /**
    * Write-side API, for persisting and maintaining stats
    */
  trait GeoMesaStatWriter {

    /**
      * Updates the persisted stats for the given schema
      *
      * @param sft simple feature type
      * @return stats - presumably the ones that were just calculated, confirm against implementations
      */
    def analyze(sft: SimpleFeatureType): Seq[Stat]

    /**
      * Gets an object to track stats as they are written
      *
      * @param sft simple feature type
      * @return updater
      */
    def updater(sft: SimpleFeatureType): StatUpdater

    /**
      * Renames a schema and/or attributes
      *
      * @param sft simple feature type
      * @param previous old feature type to migrate
      */
    def rename(sft: SimpleFeatureType, previous: SimpleFeatureType): Unit

    /**
      * Deletes any stats associated with the given schema
      *
      * @param sft simple feature type
      */
    def clear(sft: SimpleFeatureType): Unit
  }

  /**
    * Tracks stats based on simple features as they are added and removed. Can be flushed and
    * closed.
    */
  trait StatUpdater extends Closeable with Flushable {

    /**
      * Tracks a feature being added
      *
      * @param sf simple feature
      */
    def add(sf: SimpleFeature): Unit

    /**
      * Tracks a feature being removed
      *
      * @param sf simple feature
      */
    def remove(sf: SimpleFeature): Unit
  }
}
Line Stmt Id Pos Tree Symbol Tests Code
65 42362 2690 - 2723 Apply org.locationtech.geomesa.filter.visitor.BoundsFilterVisitor.visit org.locationtech.geomesa.filter.visitor.BoundsFilterVisitor.visit(filter, org.locationtech.geomesa.filter.visitor.BoundsFilterVisitor.visit$default$2)
66 42363 2735 - 2751 Select org.locationtech.geomesa.utils.geotools.RichSimpleFeatureType.RichSimpleFeatureType.getGeomField org.locationtech.geomesa.utils.geotools.RichSimpleFeatureType.RichSimpleFeatureType(sft).getGeomField
66 42364 2761 - 2803 Apply org.locationtech.geomesa.index.stats.GeoMesaStats.getMinMax GeoMesaStats.this.getMinMax[org.locationtech.jts.geom.Geometry](sft, x$1, filter, exact)
66 42365 2728 - 2804 Apply scala.Option.flatMap scala.Option.apply[String](org.locationtech.geomesa.utils.geotools.RichSimpleFeatureType.RichSimpleFeatureType(sft).getGeomField).flatMap[org.locationtech.geomesa.utils.stats.MinMax[org.locationtech.jts.geom.Geometry]](((x$1: String) => GeoMesaStats.this.getMinMax[org.locationtech.jts.geom.Geometry](sft, x$1, filter, exact)))
67 42366 2832 - 2844 Ident org.locationtech.geomesa.index.stats.GeoMesaStats.filterBounds filterBounds
68 42371 2869 - 3019 Block <nosymbol> { val env: org.locationtech.jts.geom.Envelope = bounds.min.getEnvelopeInternal(); env.expandToInclude(bounds.max.getEnvelopeInternal()); filterBounds.intersection(env) }
69 42367 2890 - 2920 Apply org.locationtech.jts.geom.Geometry.getEnvelopeInternal bounds.min.getEnvelopeInternal()
70 42368 2949 - 2979 Apply org.locationtech.jts.geom.Geometry.getEnvelopeInternal bounds.max.getEnvelopeInternal()
70 42369 2929 - 2980 Apply org.locationtech.jts.geom.Envelope.expandToInclude env.expandToInclude(bounds.max.getEnvelopeInternal())
71 42370 2989 - 3019 Apply org.geotools.geometry.jts.ReferencedEnvelope.intersection filterBounds.intersection(env)
199 42372 7376 - 7391 Select scala.collection.SeqLike.isEmpty queries.isEmpty
200 42373 7401 - 7410 TypeApply scala.collection.generic.GenericCompanion.empty scala.collection.Seq.empty[Nothing]
200 42374 7401 - 7410 Block scala.collection.generic.GenericCompanion.empty scala.collection.Seq.empty[Nothing]
201 42375 7426 - 7455 Apply scala.Int.== queries.lengthCompare(1).==(0)
201 42387 7422 - 7700 If <nosymbol> if (queries.lengthCompare(1).==(0)) scala.this.Option.option2Iterable[Nothing](GeoMesaStats.this.getStat[Nothing](sft, queries.head, filter, exact)).toSeq else GeoMesaStats.this.getStat[org.locationtech.geomesa.utils.stats.SeqStat](sft, org.locationtech.geomesa.utils.stats.Stat.SeqStat(queries), filter, exact) match { case scala.None => scala.collection.Seq.empty[Nothing] case (value: org.locationtech.geomesa.utils.stats.SeqStat)Some[org.locationtech.geomesa.utils.stats.SeqStat]((s @ _)) => s.stats.asInstanceOf[Seq[T]] }
202 42376 7478 - 7490 Select scala.collection.IterableLike.head queries.head
202 42377 7465 - 7506 Apply org.locationtech.geomesa.index.stats.GeoMesaStats.getStat GeoMesaStats.this.getStat[Nothing](sft, queries.head, filter, exact)
202 42378 7465 - 7512 Select scala.collection.TraversableOnce.toSeq scala.this.Option.option2Iterable[Nothing](GeoMesaStats.this.getStat[Nothing](sft, queries.head, filter, exact)).toSeq
202 42379 7465 - 7512 Block scala.collection.TraversableOnce.toSeq scala.this.Option.option2Iterable[Nothing](GeoMesaStats.this.getStat[Nothing](sft, queries.head, filter, exact)).toSeq
204 42380 7554 - 7575 Apply org.locationtech.geomesa.utils.stats.Stat.SeqStat org.locationtech.geomesa.utils.stats.Stat.SeqStat(queries)
204 42381 7532 - 7591 Apply org.locationtech.geomesa.index.stats.GeoMesaStats.getStat GeoMesaStats.this.getStat[org.locationtech.geomesa.utils.stats.SeqStat](sft, org.locationtech.geomesa.utils.stats.Stat.SeqStat(queries), filter, exact)
204 42386 7532 - 7694 Match <nosymbol> GeoMesaStats.this.getStat[org.locationtech.geomesa.utils.stats.SeqStat](sft, org.locationtech.geomesa.utils.stats.Stat.SeqStat(queries), filter, exact) match { case scala.None => scala.collection.Seq.empty[Nothing] case (value: org.locationtech.geomesa.utils.stats.SeqStat)Some[org.locationtech.geomesa.utils.stats.SeqStat]((s @ _)) => s.stats.asInstanceOf[Seq[T]] }
205 42382 7624 - 7633 TypeApply scala.collection.generic.GenericCompanion.empty scala.collection.Seq.empty[Nothing]
205 42383 7624 - 7633 Block scala.collection.generic.GenericCompanion.empty scala.collection.Seq.empty[Nothing]
206 42384 7658 - 7686 TypeApply scala.Any.asInstanceOf s.stats.asInstanceOf[Seq[T]]
206 42385 7658 - 7686 Block scala.Any.asInstanceOf s.stats.asInstanceOf[Seq[T]]
233 42388 8428 - 8447 Literal <nosymbol> 86400000
238 42389 8685 - 8690 Literal <nosymbol> 10000
239 42390 8786 - 8790 Literal <nosymbol> 1000
242 42391 8841 - 8997 Apply scala.collection.generic.GenericCompanion.apply scala.collection.Seq.apply[Class[_ >: java.util.Date with Double with Float with Long with Integer with String with org.locationtech.jts.geom.Geometry <: Comparable[_ >: java.util.Date with Double with Float with Long with Integer with String with T] with java.io.Serializable]](classOf[org.locationtech.jts.geom.Geometry], classOf[java.lang.String], classOf[java.lang.Integer], classOf[java.lang.Long], classOf[java.lang.Float], classOf[java.lang.Double], classOf[java.util.Date])
254 42392 9253 - 9273 Apply java.lang.Object.== b.==(classOf[java.lang.String])
254 42393 9294 - 9296 Literal <nosymbol> ""
254 42394 9294 - 9296 Block <nosymbol> ""
255 42395 9313 - 9334 Apply java.lang.Object.== b.==(classOf[java.lang.Integer])
255 42396 9354 - 9355 Literal <nosymbol> 0
255 42397 9354 - 9355 Block <nosymbol> 0
256 42398 9372 - 9400 Apply java.lang.Object.== b.==(classOf[java.lang.Long])
256 42399 9413 - 9415 Literal <nosymbol> 0L
256 42400 9413 - 9415 Block <nosymbol> 0L
257 42401 9432 - 9461 Apply java.lang.Object.== b.==(classOf[java.lang.Float])
257 42402 9473 - 9475 Literal <nosymbol> 0.0
257 42403 9473 - 9475 Block <nosymbol> 0.0
258 42404 9492 - 9522 Apply java.lang.Object.== b.==(classOf[java.lang.Double])
258 42405 9533 - 9535 Literal <nosymbol> 0.0
258 42406 9533 - 9535 Block <nosymbol> 0.0
259 42407 9552 - 9585 Apply java.lang.Class.isAssignableFrom classOf[java.util.Date].isAssignableFrom(b)
259 42408 9593 - 9603 Apply java.util.Date.<init> new java.util.Date()
259 42409 9593 - 9603 Block java.util.Date.<init> new java.util.Date()
260 42410 9620 - 9657 Apply java.lang.Class.isAssignableFrom classOf[org.locationtech.jts.geom.Geometry].isAssignableFrom(b)
260 42411 9661 - 9684 Select org.locationtech.geomesa.utils.geotools.GeometryUtils.zeroPoint org.locationtech.geomesa.utils.geotools.GeometryUtils.zeroPoint
260 42412 9661 - 9684 Block org.locationtech.geomesa.utils.geotools.GeometryUtils.zeroPoint org.locationtech.geomesa.utils.geotools.GeometryUtils.zeroPoint
261 42413 9701 - 9772 Throw <nosymbol> throw new scala.NotImplementedError(scala.StringContext.apply("Can\'t handle binding of type ", "").s(binding))
261 42414 9701 - 9772 Block <nosymbol> throw new scala.NotImplementedError(scala.StringContext.apply("Can\'t handle binding of type ", "").s(binding))
263 42415 9800 - 9823 TypeApply scala.Any.asInstanceOf default.asInstanceOf[T]
263 42416 9783 - 9824 Apply org.locationtech.geomesa.utils.stats.Histogram.buffer org.locationtech.geomesa.utils.stats.Histogram.buffer[T](default.asInstanceOf[T])
274 42417 10046 - 10066 Apply java.lang.Object.== c.==(classOf[java.lang.String])
274 42418 10083 - 10085 Literal <nosymbol> 20
274 42419 10083 - 10085 Block <nosymbol> 20
275 42420 10144 - 10165 Apply java.lang.Object.== c.==(classOf[java.lang.Integer])
275 42421 10181 - 10182 Literal <nosymbol> 1
275 42422 10181 - 10182 Block <nosymbol> 1
276 42423 10221 - 10249 Apply java.lang.Object.== c.==(classOf[java.lang.Long])
276 42424 10258 - 10259 Literal <nosymbol> 1
276 42425 10258 - 10259 Block <nosymbol> 1
277 42426 10298 - 10327 Apply java.lang.Object.== c.==(classOf[java.lang.Float])
277 42427 10335 - 10339 Literal <nosymbol> 1000
277 42428 10335 - 10339 Block <nosymbol> 1000
278 42429 10390 - 10420 Apply java.lang.Object.== c.==(classOf[java.lang.Double])
278 42430 10427 - 10431 Literal <nosymbol> 1000
278 42431 10427 - 10431 Block <nosymbol> 1000
279 42432 10482 - 10515 Apply java.lang.Class.isAssignableFrom classOf[java.util.Date].isAssignableFrom(c)
279 42433 10519 - 10533 Literal <nosymbol> 3600000
279 42434 10519 - 10533 Block <nosymbol> 3600000
280 42435 10580 - 10645 Throw <nosymbol> throw new scala.NotImplementedError(scala.StringContext.apply("Can\'t handle binding of type ", "").s(c))
280 42436 10580 - 10645 Block <nosymbol> throw new scala.NotImplementedError(scala.StringContext.apply("Can\'t handle binding of type ", "").s(c))
287 42437 10903 - 10923 Apply org.geotools.api.feature.type.PropertyType.getBinding d.getType().getBinding()
287 42438 10884 - 10924 Apply java.lang.Class.isAssignableFrom x$2.isAssignableFrom(d.getType().getBinding())
287 42439 10865 - 10925 Apply scala.collection.IterableLike.exists GeoMesaStats.this.StatClasses.exists(((x$2: Class[_ <: AnyRef]) => x$2.isAssignableFrom(d.getType().getBinding())))
287 42440 10845 - 10925 Apply scala.Boolean.&& org.locationtech.geomesa.utils.geotools.RichAttributeDescriptors.RichAttributeDescriptor(d).isMultiValued.unary_!.&&(GeoMesaStats.this.StatClasses.exists(((x$2: Class[_ <: AnyRef]) => x$2.isAssignableFrom(d.getType().getBinding()))))