Sql Computation of Cosine Similarity

SQL Computation of Cosine Similarity

with norms as (
select v,
sum(w_td * w_td) as w2
from data
group by v
)
select
x.v as ego,y.v as v,nx.w2 as x2, ny.w2 as y2,
sum(x.w_td * y.w_td) as innerproduct,
sum(x.w_td * y.w_td) / sqrt(nx.w2 * ny.w2) as cosinesimilarity
from data as x
join data as y
on (x.base=y.base)
join norms as nx
on (nx.v=x.v)
join norms as ny
on (ny.v=y.v)
where x.v < y.v
group by 1,2,3,4
order by 6 desc

yields

ego|v   |x2  |y2  |innerproduct|cosinesimilarity
--------------------------------------------------
99 |1234|41.0|38.0|23.0 |0.5826987807288609

Optimized approach for calculating cosine similarity in SQL Server

You can make a small optimization by precomputing SQRT(SUM(v1.v * v1.v)) in a temporary table.

To test the speed, I have enriched your test data this way:

INSERT INTO data
SELECT id+N, dimension, v+N
FROM data CROSS JOIN (
SELECT TOP 999 ROW_NUMBER() OVER (ORDER BY low) AS N
FROM master.dbo.spt_values
) T

Now we have 2000 distinct id-s, each with 4 dimensions.

The original query returns 1999000 rows in 29 seconds on my machine.

The following script returns the same data in 21 seconds:

CREATE TABLE #Sums (
id INT PRIMARY KEY,
S FLOAT
)

INSERT INTO #Sums (id, S)
SELECT id, SQRT(SUM(v*v)) AS S FROM data GROUP BY id

SELECT x.id1, x.id2, x.sp/(s1.s*s2.s)
FROM (
select v1.id as id1, v2.id as id2, SUM(v1.v * v2.v) as sp
from data v1
inner join data v2 on v1.dimension =v2.dimension and v1.id<v2.id
group by v1.id, v2.id
) x
INNER JOIN #Sums s1 ON s1.id=x.id1
INNER JOIN #Sums s2 ON s2.id=x.id2

DROP TABLE #Sums

How to compute cosine similarity between two texts in presto?

Use split to transform string into array and then depended on Presto version either use unnest+histogram trick or array_frequency:

-- sample data
with dataset(id, text1, text2) as (values (1, 'a b b', 'b c'))

-- query
select id, COSINE_SIMILARITY(histogram(t1), histogram(t2))
from dataset,
unnest (split(text1, ' '), split(text2, ' ')) as t(t1, t2)
group by id;

Output:















id_col1
10.6324555320336759

Cosine similarity between pair of arrays in Bigquery

Below is for BigQuery Standard SQL

#standardSQL
SELECT id1, id2, (
SELECT
SUM(value1 * value2)/
SQRT(SUM(value1 * value1))/
SQRT(SUM(value2 * value2))
FROM UNNEST(coord1) value1 WITH OFFSET pos1
JOIN UNNEST(coord2) value2 WITH OFFSET pos2
ON pos1 = pos2
) cosine_similarity
FROM `project.dataset.table`

below is dummy example for you to play with

#standardSQL
WITH `project.dataset.table` AS (
SELECT 1 id1, [1.0, 2.0, 3.0, 4.0] coord1, 2 id2, [1.0, 2.0, 3.0, 4.0] coord2 UNION ALL
SELECT 3, [2.0, 0.0, 1.0, 1.0, 0, 2.0, 1.0, 1.0], 4, [2.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0]
)
SELECT id1, id2, (
SELECT
SUM(value1 * value2)/
SQRT(SUM(value1 * value1))/
SQRT(SUM(value2 * value2))
FROM UNNEST(coord1) value1 WITH OFFSET pos1
JOIN UNNEST(coord2) value2 WITH OFFSET pos2
ON pos1 = pos2
) cosine_similarity
FROM `project.dataset.table`

with result

Row id1 id2 cosine_similarity    
1 1 2 1.0
2 3 4 0.8215838362577491


Related Topics



Leave a reply



Submit