Safe Haskell | Safe-Inferred |
---|---|
Language | GHC2021 |
Synopsis
- data AEDANumerical b a = Numerical {}
- data AEDANumberColumn
- = NumericalDouble { }
- | NumericalInt { }
- | NumericalDateTime { }
- | NumericalDate { }
- | NumericalTime { }
- data AEDAHistogramColumn
- = Categorical { }
- | Histogram { }
- | Time { }
- | Decile { }
- timeHistogramRan :: Lens' AEDAHistogramColumn UTCTime
- timeHistoGram :: Traversal' AEDAHistogramColumn (Text, Int)
- runTimeRelation :: Traversal' AEDAHistogramColumn Name
- runHistogramRelation :: Traversal' AEDAHistogramColumn Name
- runHistogramColumnName :: Lens' AEDAHistogramColumn Name
- runDecileRelation :: Traversal' AEDAHistogramColumn Name
- runDatePart :: Traversal' AEDAHistogramColumn Name
- runCategoricalRelation :: Traversal' AEDAHistogramColumn Name
- numHistogram :: Traversal' AEDAHistogramColumn (Text, Int)
- distinctHistogram :: Traversal' AEDAHistogramColumn (Text, Int)
- decileHistogram :: Traversal' AEDAHistogramColumn (Text, Double)
- decile :: Traversal' AEDAHistogramColumn Int
- class AdditionNumber a where
- addition :: a -> a -> a
- toUTCTime :: Int -> UTCTime
- fromUTCTime :: UTCTime -> Int
- timeToSeconds' :: TimeOfDay -> Int
- secondsToTime' :: Int -> TimeOfDay
- class AdditionNumber a => ZeroNumber a where
- additionIdentity :: a
- data TableDiagnostics = TableDiagnostics {}
- runRelation :: Lens' TableDiagnostics Relation
- runNumberColumns :: Lens' TableDiagnostics [AEDANumberColumn]
- runHistograms :: Lens' TableDiagnostics [AEDAHistogramColumn]
- runDiagnosticDate :: Lens' TableDiagnostics UTCTime
- runCategoricalHistograms :: Lens' TableDiagnostics [AEDAHistogramColumn]
- data NumericalStat
- data CategoricalStat
- data ColumnName = ColumnName (Ref Table) Name
- toColumnName :: Value -> Value -> Value -> Value -> ColumnName
- textValueToName :: Value -> Name
- data DecileGram
- data CountGram
- class Stats a where
- class Stats a => NumericalStats a where
- timeToSeconds :: SExp -> SExp
- secondsToTime :: SExp -> SExp
- data BucketArgument
- isDec :: BucketArgument -> Bool
- isDatePartition :: DatePart -> Bool
- isDayPartition :: DatePart -> Bool
- isTimePartition :: DatePart -> Bool
- class Stats a => Histogram a b where
- bucketStrategy :: a -> BucketArgument -> ColumnName -> SExp -> Q (SExp, SExp)
- bucketSize :: a -> SExp -> SExp -> SExp
- additionalArguments :: a -> [BucketArgument]
- countingStrategy :: a -> BucketArgument -> ColumnName -> SExp -> Q SExp
- decileQ :: Int -> ColumnName -> Query
- minMaxQuery :: ColumnName -> Query
- bucketCountSubQuery :: forall a b. Histogram a b => a -> ColumnName -> Q (Ref Relation)
- bucketDecileSubQuery :: forall a b. (NumericalStats a, Histogram a b) => a -> Int -> ColumnName -> Q (Ref Relation)
- datePartList :: [DatePart]
- truncateDate :: DatePart -> SExp -> (SExp, SExp)
- truncateDay :: DatePart -> SExp -> (SExp, SExp)
- nameDatePart :: DatePart -> SExp
- class HistogramCleanUp a where
- bucketStrategyCU :: a -> a -> a -> a
- bucketSizeCU :: a -> a -> a
- statQuery :: Bool -> Relation -> UTCTime -> NumericalStat -> ColumnName -> Query
- subQ :: NumericalStats a => a -> ColumnName -> Query
- skewAndKurtQ :: NumericalStats a => a -> ColumnName -> Query
- histogramQuery :: forall a b. Histogram a b => BucketArgument -> Relation -> UTCTime -> a -> ColumnName -> Query
- queryColumnNameSpace :: Ref Table -> Query
- mColumnToNumStatsQuery :: Bool -> NumericalStat -> Relation -> UTCTime -> [Value] -> Maybe Query
- mColumnToHistogramQuery :: forall a b. (Histogram a b, Eq a) => [a] -> Relation -> UTCTime -> [Value] -> Maybe Query
- deconstructStatQueryResult :: [Map Text Value] -> [AEDANumberColumn]
- toNumericalRow :: [(Text, Value)] -> Maybe AEDANumberColumn
- toRow :: Map Text Value -> Maybe AEDAHistogramColumn
- deconstructHistogramQueryResult :: [Map Text Value] -> [AEDAHistogramColumn]
- type NumberHistogramUpdateStrategy = (Name, Double, Double)
- numberHistogramUpdateStrategy :: AEDANumberColumn -> Maybe NumberHistogramUpdateStrategy
- getNumberHistogramUpdateStrategies :: [AEDANumberColumn] -> [NumberHistogramUpdateStrategy]
- updateNumberHistogram :: Int -> NumberHistogramUpdateStrategy -> [AEDAHistogramColumn] -> [AEDAHistogramColumn]
- type TimeHistogramUpdateStrategy = (Name, Day, Name)
- timeHistogramUpdateStrategy :: AEDANumberColumn -> [TimeHistogramUpdateStrategy]
- getTimeHistogramUpdateStrategies :: [AEDANumberColumn] -> [TimeHistogramUpdateStrategy]
- updateTimeHistogram :: Int -> TimeHistogramUpdateStrategy -> [AEDAHistogramColumn] -> [AEDAHistogramColumn]
- weekFirstDay :: DayOfWeek -> Day -> Day
- monthFirstDay :: Day -> Day
- quarterFirstDay :: Day -> Day
- yearFirstDay :: Day -> Day
- nextMonth :: Day -> Day
- successorMonths :: Int -> Day -> Day
- retrieveReport :: Relation -> Day -> Ref Table -> Query
- retrieveReportHistogram :: Relation -> Day -> Ref Table -> Query
- retrieveReportNumerical :: Relation -> Day -> Ref Table -> Query
Documentation
data AEDANumerical b a #
Instances
data AEDANumberColumn #
Instances
data AEDAHistogramColumn #
Instances
class AdditionNumber a where #
fromUTCTime :: UTCTime -> Int #
timeToSeconds' :: TimeOfDay -> Int #
secondsToTime' :: Int -> TimeOfDay #
class AdditionNumber a => ZeroNumber a where #
additionIdentity :: a #
Instances
ZeroNumber Day # | |
Defined in Napkin.Run.BigQuery.AEDATypes additionIdentity :: Day # | |
ZeroNumber UTCTime # | |
Defined in Napkin.Run.BigQuery.AEDATypes | |
ZeroNumber TimeOfDay # | |
Defined in Napkin.Run.BigQuery.AEDATypes | |
ZeroNumber Double # | |
Defined in Napkin.Run.BigQuery.AEDATypes | |
ZeroNumber Int # | |
Defined in Napkin.Run.BigQuery.AEDATypes additionIdentity :: Int # |
data TableDiagnostics #
Instances
data NumericalStat #
Instances
data CategoricalStat #
Instances
Show CategoricalStat # | |
Defined in Napkin.Run.BigQuery.AEDATypes showsPrec :: Int -> CategoricalStat -> ShowS # show :: CategoricalStat -> String # showList :: [CategoricalStat] -> ShowS # | |
Eq CategoricalStat # | |
Defined in Napkin.Run.BigQuery.AEDATypes (==) :: CategoricalStat -> CategoricalStat -> Bool # (/=) :: CategoricalStat -> CategoricalStat -> Bool # | |
Stats CategoricalStat # | |
Defined in Napkin.Run.BigQuery.AEDATypes fromValue :: Value -> Maybe CategoricalStat # toValue :: CategoricalStat -> Value # | |
Histogram CategoricalStat CountGram # | |
Defined in Napkin.Run.BigQuery.AEDATypes bucketStrategy :: CategoricalStat -> BucketArgument -> ColumnName -> SExp -> Q (SExp, SExp) # bucketSize :: CategoricalStat -> SExp -> SExp -> SExp # additionalArguments :: CategoricalStat -> [BucketArgument] # countingStrategy :: CategoricalStat -> BucketArgument -> ColumnName -> SExp -> Q SExp # |
data ColumnName #
toColumnName :: Value -> Value -> Value -> Value -> ColumnName #
textValueToName :: Value -> Name #
Helper function to transform the result of a Query into a Name.
data DecileGram #
Instances
Histogram NumericalStat DecileGram # | |
Defined in Napkin.Run.BigQuery.AEDATypes bucketStrategy :: NumericalStat -> BucketArgument -> ColumnName -> SExp -> Q (SExp, SExp) # bucketSize :: NumericalStat -> SExp -> SExp -> SExp # additionalArguments :: NumericalStat -> [BucketArgument] # countingStrategy :: NumericalStat -> BucketArgument -> ColumnName -> SExp -> Q SExp # |
Instances
Histogram CategoricalStat CountGram # | |
Defined in Napkin.Run.BigQuery.AEDATypes bucketStrategy :: CategoricalStat -> BucketArgument -> ColumnName -> SExp -> Q (SExp, SExp) # bucketSize :: CategoricalStat -> SExp -> SExp -> SExp # additionalArguments :: CategoricalStat -> [BucketArgument] # countingStrategy :: CategoricalStat -> BucketArgument -> ColumnName -> SExp -> Q SExp # | |
Histogram NumericalStat CountGram # | |
Defined in Napkin.Run.BigQuery.AEDATypes bucketStrategy :: NumericalStat -> BucketArgument -> ColumnName -> SExp -> Q (SExp, SExp) # bucketSize :: NumericalStat -> SExp -> SExp -> SExp # additionalArguments :: NumericalStat -> [BucketArgument] # countingStrategy :: NumericalStat -> BucketArgument -> ColumnName -> SExp -> Q SExp # |
Instances
Stats CategoricalStat # | |
Defined in Napkin.Run.BigQuery.AEDATypes fromValue :: Value -> Maybe CategoricalStat # toValue :: CategoricalStat -> Value # | |
Stats NumericalStat # | |
Defined in Napkin.Run.BigQuery.AEDATypes fromValue :: Value -> Maybe NumericalStat # toValue :: NumericalStat -> Value # |
class Stats a => NumericalStats a where #
Instances
NumericalStats NumericalStat # | |
Defined in Napkin.Run.BigQuery.AEDATypes fromNumber :: NumericalStat -> SExp -> SExp # toNumber :: NumericalStat -> SExp -> SExp # |
timeToSeconds :: SExp -> SExp #
secondsToTime :: SExp -> SExp #
isDec :: BucketArgument -> Bool #
isDatePartition :: DatePart -> Bool #
isDayPartition :: DatePart -> Bool #
isTimePartition :: DatePart -> Bool #
class Stats a => Histogram a b where #
bucketStrategy :: a -> BucketArgument -> ColumnName -> SExp -> Q (SExp, SExp) #
In order to get a histogram there must be some kind of bucketing strategy and counting strategy, one of which depends on the bucket size. bucketStrategy returns a (bucket, histogram name) tuple (the name of the histogram is based on the bucket). countingStrategy returns the count of the bucket (either a plain count, or the result of the particular strategy devised). additionalArguments defines which additional parameters will be included in the histogram query (date partitions in the case of Date NumericalStats with a count histogram, deciles in the case of decile histograms).
bucketSize :: a -> SExp -> SExp -> SExp #
additionalArguments :: a -> [BucketArgument] #
countingStrategy :: a -> BucketArgument -> ColumnName -> SExp -> Q SExp #
Instances
decileQ :: Int -> ColumnName -> Query #
minMaxQuery :: ColumnName -> Query #
bucketCountSubQuery :: forall a b. Histogram a b => a -> ColumnName -> Q (Ref Relation) #
bucketDecileSubQuery :: forall a b. (NumericalStats a, Histogram a b) => a -> Int -> ColumnName -> Q (Ref Relation) #
datePartList :: [DatePart] #
nameDatePart :: DatePart -> SExp #
class HistogramCleanUp a where #
bucketStrategyCU :: a -> a -> a -> a #
In order to clean up a histogram we need the bucket size and the bucketing strategy.
bucketSizeCU :: a -> a -> a #
Instances
HistogramCleanUp Double # | |
Defined in Napkin.Run.BigQuery.AEDATypes | |
HistogramCleanUp Int # | |
Defined in Napkin.Run.BigQuery.AEDATypes |
statQuery :: Bool -> Relation -> UTCTime -> NumericalStat -> ColumnName -> Query #
Creates a query to gather statistical information from a column (skew, stDev, kurtosis, etc). It is intended to be used on columns of Int, Float, Time, Day, TimeStamp or DateTime (they may be nullable). The function takes a NumericalStat to indicate how it should handle casting in the case of DateTime, TimeStamp, and Day columns.
subQ :: NumericalStats a => a -> ColumnName -> Query #
Gathers statistical primitives that will be used to calculate the various statistics that are supposed to be gathered. Rather than calculating Skew and Kurtosis outright, the various summed powers are calculated first. We can use the fact that multiplication distributes over addition, and expanding the powered subtraction in the numerator, to reduce the number of calculations performed overall.
skewAndKurtQ :: NumericalStats a => a -> ColumnName -> Query #
This function relies on the statistical primitives gathered by subQ to calculate skew and kurtosis. As a motivating example, the (unbiased) variance of a sample X with n members is \( \sum_{x \in X} (x - \bar{x})^2 / (n - 1) \), where \( \bar{x} \) is the average. Rather than perform the subtraction and division for each member of X, we can note that \( \sum_{x \in X} \frac{(x - \bar{x})^2}{n - 1} = \frac{1}{n - 1} \sum_{x \in X} (x - \bar{x})^2 = \frac{1}{n - 1} \sum_{x \in X} (x^2 - 2 \bar{x} x + \bar{x}^2) = \frac{1}{n - 1} \left( \sum_{x \in X} x^2 - 2 \bar{x} \sum_{x \in X} x + n \bar{x}^2 \right) \). For very large X we can forgo many unneeded divisions and subtractions and merely calculate based on the summed powers of the members of X. We do a similar trick for skew and kurtosis in this function.
histogramQuery :: forall a b. Histogram a b => BucketArgument -> Relation -> UTCTime -> a -> ColumnName -> Query #
Takes a Histogram statType and creates a histogram depending on the BucketArgument used (which will be defined in the Histogram instance). This accepts both count histograms and decile histograms.
queryColumnNameSpace :: Ref Table -> Query #
In order for statQuery
and histogramQuery
to work they need to be given a
ColumnName (Ref projectName :| [dataSetName, tableName]) columnName. In order to decide which query to perform, each
ColumnName will also need an associated data type. For each dataset, BigQuery provides INFORMATION_SCHEMA views.
The INFORMATION_SCHEMA.COLUMNS view can be queried for the information needed. queryColumnNameSpace produces this query.
In the case of a QueryRelation or table sampling, the relation is first created as a table in the given
dataset so the relation's info can be obtained.
mColumnToNumStatsQuery :: Bool -> NumericalStat -> Relation -> UTCTime -> [Value] -> Maybe Query #
These functions each take a single [Value] and return a Just Query. The results are filtered based on the stat-type(s) provided to them, and will be Nothing in the case of a malformed response. mColumnToNumStatsQuery produces stat queries and mColumnToHistogramQuery produces histogram queries.
mColumnToHistogramQuery :: forall a b. (Histogram a b, Eq a) => [a] -> Relation -> UTCTime -> [Value] -> Maybe Query #
deconstructStatQueryResult :: [Map Text Value] -> [AEDANumberColumn] #
toNumericalRow :: [(Text, Value)] -> Maybe AEDANumberColumn #
type NumberHistogramUpdateStrategy = (Name, Double, Double) #
The histogram is updated iteratively, starting at the bottom bucket and adding the bucket size recursively. For Int and Double the bucket size is calculated using the min and max of the table, either to guarantee 20 equal buckets, or in the case of Ints with little difference just 1. The update strategy is therefore the column name, the bottom bucket, and the bucket size.
updateNumberHistogram :: Int -> NumberHistogramUpdateStrategy -> [AEDAHistogramColumn] -> [AEDAHistogramColumn] #
type TimeHistogramUpdateStrategy = (Name, Day, Name) #
For time and date columns the bucket size is determined by the date partition. Therefore the update strategy is the column name, minimum value, and date partition.
updateTimeHistogram :: Int -> TimeHistogramUpdateStrategy -> [AEDAHistogramColumn] -> [AEDAHistogramColumn] #
weekFirstDay :: DayOfWeek -> Day -> Day #
monthFirstDay :: Day -> Day #
quarterFirstDay :: Day -> Day #
yearFirstDay :: Day -> Day #
successorMonths :: Int -> Day -> Day #