SQL Computation of Cosine Similarity
with norms as (
select v,
sum(w_td * w_td) as w2
from data
group by v
)
select
x.v as ego,y.v as v,nx.w2 as x2, ny.w2 as y2,
sum(x.w_td * y.w_td) as innerproduct,
sum(x.w_td * y.w_td) / sqrt(nx.w2 * ny.w2) as cosinesimilarity
from data as x
join data as y
on (x.base=y.base)
join norms as nx
on (nx.v=x.v)
join norms as ny
on (ny.v=y.v)
where x.v < y.v
group by 1,2,3,4
order by 6 desc
yields
ego|v |x2 |y2 |innerproduct|cosinesimilarity
--------------------------------------------------
99 |1234|41.0|38.0|23.0 |0.5826987807288609
Optimized approach for calculating cosine similarity in SQL Server
You can make a small optimization by precomputing SQRT(SUM(v1.v * v1.v))
in a temporary table.
To test the speed, I have enriched your test data this way:
INSERT INTO data
SELECT id+N, dimension, v+N
FROM data CROSS JOIN (
SELECT TOP 999 ROW_NUMBER() OVER (ORDER BY low) AS N
FROM master.dbo.spt_values
) T
Now we have 2000 distinct id-s, each with 4 dimensions.
The original query returns 1999000 rows in 29 seconds on my machine.
The following script returns the same data in 21 seconds:
CREATE TABLE #Sums (
id INT PRIMARY KEY,
S FLOAT
)
INSERT INTO #Sums (id, S)
SELECT id, SQRT(SUM(v*v)) AS S FROM data GROUP BY id
SELECT x.id1, x.id2, x.sp/(s1.s*s2.s)
FROM (
select v1.id as id1, v2.id as id2, SUM(v1.v * v2.v) as sp
from data v1
inner join data v2 on v1.dimension =v2.dimension and v1.id<v2.id
group by v1.id, v2.id
) x
INNER JOIN #Sums s1 ON s1.id=x.id1
INNER JOIN #Sums s2 ON s2.id=x.id2
DROP TABLE #Sums
How to compute cosine similarity between two texts in presto?
Use split
to transform string into array and then depended on Presto version either use unnest
+histogram
trick or array_frequency
:
-- sample data
with dataset(id, text1, text2) as (values (1, 'a b b', 'b c'))
-- query
select id, COSINE_SIMILARITY(histogram(t1), histogram(t2))
from dataset,
unnest (split(text1, ' '), split(text2, ' ')) as t(t1, t2)
group by id;
Output:
id | _col1 |
---|---|
1 | 0.6324555320336759 |
Cosine similarity between pair of arrays in Bigquery
Below is for BigQuery Standard SQL
#standardSQL
SELECT id1, id2, (
SELECT
SUM(value1 * value2)/
SQRT(SUM(value1 * value1))/
SQRT(SUM(value2 * value2))
FROM UNNEST(coord1) value1 WITH OFFSET pos1
JOIN UNNEST(coord2) value2 WITH OFFSET pos2
ON pos1 = pos2
) cosine_similarity
FROM `project.dataset.table`
below is dummy example for you to play with
#standardSQL
WITH `project.dataset.table` AS (
SELECT 1 id1, [1.0, 2.0, 3.0, 4.0] coord1, 2 id2, [1.0, 2.0, 3.0, 4.0] coord2 UNION ALL
SELECT 3, [2.0, 0.0, 1.0, 1.0, 0, 2.0, 1.0, 1.0], 4, [2.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0]
)
SELECT id1, id2, (
SELECT
SUM(value1 * value2)/
SQRT(SUM(value1 * value1))/
SQRT(SUM(value2 * value2))
FROM UNNEST(coord1) value1 WITH OFFSET pos1
JOIN UNNEST(coord2) value2 WITH OFFSET pos2
ON pos1 = pos2
) cosine_similarity
FROM `project.dataset.table`
with result
Row id1 id2 cosine_similarity
1 1 2 1.0
2 3 4 0.8215838362577491
Related Topics
How to Find Rows Which Are Duplicates by a Key But Not Duplicates in All Columns
Recursive Query Challenge - Simple Parent/Child Example
Sql Query to Find Last Day of Current Month
Use a Like Statement on SQL Server Xml Datatype
How to Call Scalar Function in SQL Server 2008
If I Update a View, Will My Original Tables Get Updated
How to Get the Last Month Data and Month to Date Data
How to Use a Trim Function in SQL Server
Efficiently Duplicate Some Rows in Postgresql Table
Using a View with No Primary Key with Entity
Undelete Recently Deleted Rows SQL Server
How to Exclude Tables from Sp_Msforeachtable
How to Get Array/Bag of Elements from Hive Group by Operator
Drop All Tables Sharing The Same Prefix in Postgres
Why Is There No "Product()" Aggregate Function in Sql