Sql Computation of Cosine Similarity

SQL Computation of Cosine Similarity

with norms as (
    select v,
        sum(w_td * w_td) as w2
    from data
    group by v
)
select 
    x.v as ego,y.v as v,nx.w2 as x2, ny.w2 as y2,
    sum(x.w_td * y.w_td) as innerproduct,
    sum(x.w_td * y.w_td) / sqrt(nx.w2 * ny.w2) as cosinesimilarity
from data as x
join data as y
    on (x.base=y.base)
join norms as nx
    on (nx.v=x.v)
join norms as ny
    on (ny.v=y.v)
where x.v < y.v
group by 1,2,3,4
order by 6 desc

yields

ego|v   |x2  |y2  |innerproduct|cosinesimilarity
--------------------------------------------------
99 |1234|41.0|38.0|23.0        |0.5826987807288609

Optimized approach for calculating cosine similarity in SQL Server

You can make a small optimization by precomputing SQRT(SUM(v1.v * v1.v)) in a temporary table.

To test the speed, I have enriched your test data this way:

INSERT INTO data
SELECT id+N, dimension, v+N
FROM data CROSS JOIN (
    SELECT TOP 999 ROW_NUMBER() OVER (ORDER BY low) AS N 
    FROM master.dbo.spt_values
) T

Now we have 2000 distinct id-s, each with 4 dimensions.

The original query returns 1999000 rows in 29 seconds on my machine.

The following script returns the same data in 21 seconds:

CREATE TABLE #Sums (
    id INT PRIMARY KEY,
    S FLOAT
)

INSERT INTO #Sums (id, S)
SELECT id, SQRT(SUM(v*v)) AS S FROM data GROUP BY id

SELECT x.id1, x.id2, x.sp/(s1.s*s2.s)
FROM (
    select v1.id as id1, v2.id as id2, SUM(v1.v * v2.v) as sp
    from data v1 
    inner join data v2 on v1.dimension =v2.dimension and v1.id<v2.id
    group by v1.id, v2.id
) x
INNER JOIN #Sums s1 ON s1.id=x.id1
INNER JOIN #Sums s2 ON s2.id=x.id2

DROP TABLE #Sums

How to compute cosine similarity between two texts in presto?

Use split to transform string into array and then depended on Presto version either use unnest+histogram trick or array_frequency:

-- sample data
with dataset(id, text1, text2) as (values (1, 'a b b', 'b c'))

-- query
select id, COSINE_SIMILARITY(histogram(t1), histogram(t2))
from dataset,
unnest (split(text1, ' '), split(text2, ' ')) as t(t1, t2)
group by id;

Output:

id	_col1
1	0.6324555320336759

Cosine similarity between pair of arrays in Bigquery

Below is for BigQuery Standard SQL

#standardSQL
SELECT id1, id2, ( 
  SELECT 
    SUM(value1 * value2)/ 
    SQRT(SUM(value1 * value1))/ 
    SQRT(SUM(value2 * value2))
  FROM UNNEST(coord1) value1 WITH OFFSET pos1 
  JOIN UNNEST(coord2) value2 WITH OFFSET pos2 
  ON pos1 = pos2
  ) cosine_similarity
FROM `project.dataset.table`

below is dummy example for you to play with

#standardSQL
WITH `project.dataset.table` AS (
  SELECT 1 id1, [1.0, 2.0, 3.0, 4.0] coord1, 2 id2, [1.0, 2.0, 3.0, 4.0] coord2 UNION ALL
  SELECT 3, [2.0, 0.0, 1.0, 1.0, 0, 2.0, 1.0, 1.0], 4, [2.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0]
)
SELECT id1, id2, ( 
  SELECT 
    SUM(value1 * value2)/
    SQRT(SUM(value1 * value1))/ 
    SQRT(SUM(value2 * value2))
  FROM UNNEST(coord1) value1 WITH OFFSET pos1 
  JOIN UNNEST(coord2) value2 WITH OFFSET pos2 
  ON pos1 = pos2
  ) cosine_similarity
FROM `project.dataset.table`

with result

Row id1 id2 cosine_similarity    
1   1   2   1.0  
2   3   4   0.8215838362577491

Sql Computation of Cosine Similarity