Guys, consider I have the following column, whats the best approach to calculate the statistical mode of each row? e.g 1251223 would give me mode of 2.
not sure if ideal, but this gives me the mode, as in most frequent digit on each word
WITH s AS (
SELECT '32-4403' AS w
UNION ALL SELECT '750343'
UNION ALL SELECT '271123-'
UNION ALL SELECT NULL
UNION ALL SELECT '636175-'
UNION ALL SELECT '33'
UNION ALL SELECT '26/221-'
UNION ALL SELECT '04-'
UNION ALL SELECT 'x7803'
UNION ALL SELECT '55658'
UNION ALL SELECT '97807'
UNION ALL SELECT '42336'
UNION ALL SELECT '75222'
UNION ALL SELECT '04257'
UNION ALL SELECT '65026'
UNION ALL SELECT '83243'
),
cleaned AS (
SELECT w, regexp_replace(w, '[^0-9]', '') AS w2 FROM s
),
counts AS (
SELECT
w, w2,
length(regexp_replace(w2, '[^0]', '')) AS zeroes,
length(regexp_replace(w2, '[^1]', '')) AS ones,
length(regexp_replace(w2, '[^2]', '')) AS twos,
length(regexp_replace(w2, '[^3]', '')) AS threes,
length(regexp_replace(w2, '[^4]', '')) AS fours,
length(regexp_replace(w2, '[^5]', '')) AS fives,
length(regexp_replace(w2, '[^6]', '')) AS sixes,
length(regexp_replace(w2, '[^7]', '')) AS sevens,
length(regexp_replace(w2, '[^8]', '')) AS eights,
length(regexp_replace(w2, '[^9]', '')) AS nines
FROM cleaned
)
SELECT
w, w2,
CASE
WHEN w2 is NULL THEN null
WHEN zeroes = GREATEST(zeroes, ones, twos, threes, fours, fives, sixes, sevens, eights, nines) THEN 0
WHEN ones = GREATEST(zeroes, ones, twos, threes, fours, fives, sixes, sevens, eights, nines) THEN 1
WHEN twos = GREATEST(zeroes, ones, twos, threes, fours, fives, sixes, sevens, eights, nines) THEN 2
WHEN threes = GREATEST(zeroes, ones, twos, threes, fours, fives, sixes, sevens, eights, nines) THEN 3
WHEN fours = GREATEST(zeroes, ones, twos, threes, fours, fives, sixes, sevens, eights, nines) THEN 4
WHEN fives = GREATEST(zeroes, ones, twos, threes, fours, fives, sixes, sevens, eights, nines) THEN 5
WHEN sixes = GREATEST(zeroes, ones, twos, threes, fours, fives, sixes, sevens, eights, nines) THEN 6
WHEN sevens = GREATEST(zeroes, ones, twos, threes, fours, fives, sixes, sevens, eights, nines) THEN 7
WHEN eights = GREATEST(zeroes, ones, twos, threes, fours, fives, sixes, sevens, eights, nines) THEN 8
WHEN nines = GREATEST(zeroes, ones, twos, threes, fours, fives, sixes, sevens, eights, nines) THEN 9
END AS mode
FROM counts;
I clean anything not a digit, then I count the number of times each digit appears on the string (filter only that digit and get the string length), and then I check which number has the count equal to the greatest for all the numbers.In the case of a tie, the smaller digit wins
This way you see the most frequent numbers and those with the same frequency.
WITH s AS (
SELECT '32-4403' AS w
UNION ALL SELECT '750343'
UNION ALL SELECT '271123-'
UNION ALL SELECT NULL
UNION ALL SELECT '636175-'
UNION ALL SELECT '33'
UNION ALL SELECT '26/221-'
UNION ALL SELECT '04-'
UNION ALL SELECT 'x7803'
UNION ALL SELECT '55658'
UNION ALL SELECT '97807'
UNION ALL SELECT '42336'
UNION ALL SELECT '75222'
UNION ALL SELECT '04257'
UNION ALL SELECT '65026'
UNION ALL SELECT '83243'
),
nb AS (
SELECT (x-1)::int as n FROM long_sequence(10)
),
digits AS (
SELECT s.w, SUBSTRING(s.w, nb.n + 1, 1) AS digit
FROM s
JOIN nb ON LENGTH(s.w) > nb.n
WHERE s.w IS NOT NULL AND SUBSTRING(s.w, nb.n + 1, 1) IN ('0','1','2','3','4','5','6','7','8','9')
),
frequencies AS (
SELECT w, digit, COUNT(*) AS freq
FROM digits
GROUP BY w, digit
),
max_frequencies AS (
SELECT w, MAX(freq) AS max_freq
FROM frequencies
GROUP BY w
)
SELECT f.w, f.digit, f.freq AS fre
FROM frequencies f
JOIN max_frequencies m ON f.w = m.w AND f.freq = m.max_freq
;
If you replace this part:
max_frequencies AS (
SELECT w, MAX(freq) AS max_freq
FROM frequencies
GROUP BY w
)
SELECT f.w, f.digit, f.freq AS fre
FROM frequencies f
JOIN max_frequencies m ON f.w = m.w AND f.freq = m.max_freq
;
with this:
final AS (
SELECT
w, digit, freq, ROW_NUMBER() OVER (PARTITION BY w ORDER BY freq DESC, digit ASC) AS rn
FROM frequencies
)
SELECT w, digit, freq AS fre,
FROM final
WHERE rn = 1
;
you will only see the smallest one
Of course, in the part
AND SUBSTRING(s.w, nb.n + 1, 1) IN (β0β,β1β,β2β,β3β,β4β,β5β,β6β,β7β,β8β,β9β)
you can enter as many characters as you want to count (in this case, just the digits).
If you remove it completely, it will count all the characters.
Very cool!
I see in your example you are going for strings of max 10 characters, as per the long_sequence. Unfortunately long_sequence does not accept a dynamic value, so we cannot just generate as many numbers as the max length, but there is a trick we can do. You can enter an arbitrarily large number on nb, then add a condition on digits
nb AS (
SELECT (x-1)::int as n FROM long_sequence(1000)
),
digits AS (
SELECT s.w, SUBSTRING(s.w, nb.n + 1, 1) AS digit
FROM s
JOIN nb ON LENGTH(s.w) > nb.n
WHERE
nb.n::timestamp < (SELECT max(length(w))::timestamp FROM s) AND
s.w IS NOT NULL AND SUBSTRING(s.w, nb.n + 1, 1) IN ('0','1','2','3','4','5','6','7','8','9')
),
Subqueries in conditions are only allowed for timestamps, but I am doing the trick of converting both sides to timestamps (since it is a positive int, this should work fine) then comparing them.
Itβs a too bad that long_sequence doesnβt accept a dynamic value.
It could accept long_sequence(length(w)) and maybe accept a long, int, double parameter
Yep. Sequences have lots of issues indeed. It is a pity we also donβt support dynamic values on generate_series http://localhost:3001/docs/reference/function/row-generator/#generate_series
