Sql: How to Get All The Distinct Characters in a Column, Across All Rows

SQL: how to get all the distinct characters in a column, across all rows

Given that your column is varchar, on a single-byte code page it can only store characters with codes 0 to 255, whatever that code page is. If you only use the 32-128 code range (printable ASCII is 32-126), then you can simply check, one by one, whether each of the characters 32-128 occurs. The following query does that, looking in sys.objects.name:

-- Lists every character with code 32-128 that occurs in at least one
-- sys.objects.name value, returned as one concatenated XML string.
with cteDigits as (
    -- the ten decimal digits; building block for the numbers table
    select 0 as Number
    union all select 1
    union all select 2
    union all select 3
    union all select 4
    union all select 5
    union all select 6
    union all select 7
    union all select 8
    union all select 9
)
, cteNumbers as (
    -- units + tens + hundreds => every integer from 0 to 999
    select U.Number + T.Number * 10 + H.Number * 100 as Number
    from cteDigits as U
    cross join cteDigits as T
    cross join cteDigits as H
)
, cteChars as (
    -- candidate characters to probe for
    select CHAR(Number) as Char
    from cteNumbers
    where Number between 32 and 128
)
select cteChars.Char as [*]
from cteChars
cross apply (
    -- One matching row is enough to prove the character is present.
    -- The explicit binary collation makes the probe exact: under a
    -- case-insensitive collation 'a' would also be reported whenever only
    -- 'A' occurs (and vice versa).
    select top (1) 1 as found
    from sys.objects as so
    where CHARINDEX(cteChars.Char, so.name collate Latin1_General_BIN2) > 0
) as o
-- NOTE(review): FOR XML entitizes XML-special characters, so '&', '<' and '>'
-- presumably appear as &amp;, &lt; and &gt; in the output - confirm.
for xml path('');

Finding unique characters from a table field for all rows

Try this one -

-- Sample data. The table variable has to be declared in the same batch as
-- the statements that use it.
DECLARE @temp TABLE (txt VARCHAR(8000));

INSERT INTO @temp (txt)
VALUES ('abcd3'), ('abcdefg*');

-- Concatenates every txt value into one string (mtxt), cuts it into single
-- characters by joining to a numbers table, and glues the DISTINCT characters
-- back together into one value.
-- NOTE(review): master.dbo.spt_values is undocumented; its type 'P' rows
-- presumably cover 0-2047 only, so characters past position 2048 of the
-- concatenated text would be ignored - confirm on your version.
SELECT disword = (
    SELECT DISTINCT dt.ch
    FROM (
        SELECT ch = SUBSTRING(t.mtxt, n.number + 1, 1)
        FROM [master].dbo.spt_values AS n
        CROSS JOIN (
            SELECT mtxt = (
                SELECT txt
                FROM @temp
                FOR XML PATH(N''), TYPE, ROOT
            ).value(N'root[1]', N'NVARCHAR(MAX)')
        ) AS t
        -- upper-case 'P' so the filter also matches on case-sensitive servers
        WHERE n.[type] = N'P'
            AND n.number <= LEN(t.mtxt) - 1
    ) AS dt
    FOR XML PATH(N''), TYPE, ROOT
).value(N'root[1]', N'NVARCHAR(MAX)');

Example (edited):

SET NOCOUNT ON;

-- Sample data: the strings whose distinct characters we want.
DECLARE @temp TABLE (txt VARCHAR(8000));

INSERT INTO @temp (txt)
VALUES ('abcd'), ('abcdefg*'), (REPLICATE('-', 8000)), (REPLICATE('+', 8000));

-- Tally table: one row per character position of the concatenated text.
DECLARE @t TABLE (i BIGINT);

-- Total number of positions needed. DATALENGTH is used instead of LEN because
-- LEN ignores trailing spaces, which would make the tally too short and hide
-- the characters at the end of the concatenated string. The BIGINT cast keeps
-- the SUM from overflowing INT on large tables; COALESCE covers an empty @temp.
DECLARE @l BIGINT = COALESCE(
    (SELECT SUM(CAST(DATALENGTH(txt) AS BIGINT)) FROM @temp),
    0
);

-- Set-based replacement for a row-by-row WHILE loop: stacked cross joins give
-- up to 10^8 candidate rows and TOP stops after exactly @l of them.
WITH e1 (n) AS (
    SELECT 1
    FROM (VALUES (1), (1), (1), (1), (1), (1), (1), (1), (1), (1)) AS v (n)
),
e2 (n) AS (SELECT 1 FROM e1 AS a CROSS JOIN e1 AS b),  -- 100 rows
e4 (n) AS (SELECT 1 FROM e2 AS a CROSS JOIN e2 AS b),  -- 10,000 rows
e8 (n) AS (SELECT 1 FROM e4 AS a CROSS JOIN e4 AS b)   -- 100,000,000 rows
INSERT INTO @t (i)
SELECT TOP (@l)
    ROW_NUMBER() OVER (ORDER BY (SELECT NULL))
FROM e8;

-- Concatenate all txt values (mtxt), take the character at every tally
-- position, and glue the DISTINCT characters back together.
SELECT disword = (
    SELECT DISTINCT dt.ch
    FROM (
        SELECT ch = SUBSTRING(t.mtxt, n.i, 1)
        FROM @t AS n
        CROSS JOIN (
            SELECT mtxt = (
                SELECT txt
                FROM @temp
                FOR XML PATH(N''), TYPE, ROOT
            ).value(N'root[1]', N'NVARCHAR(MAX)')
        ) AS t
    ) AS dt
    FOR XML PATH(N''), TYPE, ROOT
).value(N'root[1]', N'NVARCHAR(MAX)');

Have a look at this solution -

-- Frequency of every character across all txt values.
-- Relies on @temp (source strings) and @t (tally of positions, column i)
-- being declared and filled earlier in the same batch.
SELECT
    dt.ch
    , cnt = COUNT(*)
FROM (
    SELECT ch = SUBSTRING(t.mtxt, n.i, 1)
    FROM @t AS n
    CROSS JOIN (
        SELECT mtxt = (
            SELECT txt
            FROM @temp
            FOR XML PATH(N''), TYPE, ROOT
        ).value(N'root[1]', N'NVARCHAR(MAX)')
    ) AS t
) AS dt
-- Drop only the empty strings produced by tally positions past the end of
-- mtxt. A comparison with '' cannot be used here: trailing blanks are ignored
-- in string comparisons, so it would also throw away real space characters.
WHERE DATALENGTH(dt.ch) > 0
GROUP BY dt.ch
-- the second key keeps the order stable when counts tie
ORDER BY cnt DESC, dt.ch;

MySQL query to get all the distinct letters used across the rows

You could use SUBSTRING with derived tally table:

-- Every distinct letter used in words.word (MySQL).
-- A derived tally table supplies the positions 1..1000; each word is joined
-- to the positions that exist in it and the character there is extracted.
SELECT DISTINCT SUBSTRING(w.word, tally.n, 1) AS letter
FROM words AS w
CROSS JOIN
(
    -- units + tens + hundreds => 1..1000 without gaps
    -- (a hundreds weight of 1000 would skip positions 101-1000)
    SELECT a.N + b.N * 10 + c.N * 100 + 1 AS n
    FROM
    (SELECT 0 AS N UNION ALL SELECT 1 UNION ALL SELECT 2 UNION ALL SELECT 3 UNION ALL SELECT 4 UNION ALL SELECT 5 UNION ALL SELECT 6 UNION ALL SELECT 7 UNION ALL SELECT 8 UNION ALL SELECT 9) AS a
    CROSS JOIN
    (SELECT 0 AS N UNION ALL SELECT 1 UNION ALL SELECT 2 UNION ALL SELECT 3 UNION ALL SELECT 4 UNION ALL SELECT 5 UNION ALL SELECT 6 UNION ALL SELECT 7 UNION ALL SELECT 8 UNION ALL SELECT 9) AS b
    CROSS JOIN
    (SELECT 0 AS N UNION ALL SELECT 1 UNION ALL SELECT 2 UNION ALL SELECT 3 UNION ALL SELECT 4 UNION ALL SELECT 5 UNION ALL SELECT 6 UNION ALL SELECT 7 UNION ALL SELECT 8 UNION ALL SELECT 9) AS c
) AS tally
-- CHAR_LENGTH counts characters; LENGTH counts bytes and would overshoot on
-- multi-byte text, adding a spurious empty "letter" to the result
WHERE tally.n <= CHAR_LENGTH(w.word)
ORDER BY letter;

SqlFiddleDemo

If you need o and O as different letters add COLLATE utf8_bin:

-- Replacement for the first line of the query above: the binary collation
-- makes DISTINCT treat 'o' and 'O' as different letters.
-- NOTE(review): utf8_bin presumably requires `word` to use the utf8 character
-- set (a utf8mb4 column would need utf8mb4_bin) - confirm against the table.
SELECT DISTINCT SUBSTRING(word, n, 1) COLLATE utf8_bin AS letter 

Extract Distinct Special Characters in a Column using SQL Select

There is one place where you need to define your "regular" characters

-- Pattern containing a negated character class of letters, digits and space,
-- wrapped in % wildcards; edit the bracket list to change what counts as a
-- "regular" character.
select  '%[^a-zA-Z0-9 ]%'

-- Counts every "special" character found in t.CompanyName.
-- prm holds the single pattern that defines which characters are regular.
WITH prm (regular_char) AS (
    SELECT '%[^a-zA-Z0-9 ]%'
),
-- Recursive walk: each step finds the next character matched by the pattern,
-- emits it as special_char and continues with the text that follows it.
cte (special_char, string_suffix) AS (
    -- anchor: the full company name, no special character extracted yet
    SELECT
        '' AS special_char,
        CompanyName AS string_suffix
    FROM t

    UNION ALL

    -- recursive step: split the remaining text at the next special character
    SELECT
        SUBSTRING(hit.string_suffix, hit.special_char_ind, 1) AS special_char,
        SUBSTRING(hit.string_suffix, hit.special_char_ind + 1, LEN(hit.string_suffix)) AS string_suffix
    FROM (
        SELECT
            cte.string_suffix,
            -- PATINDEX returns 0 when nothing matches; NULLIF turns that into NULL
            NULLIF(PATINDEX(prm.regular_char, cte.string_suffix), 0) AS special_char_ind
        FROM cte
        CROSS JOIN prm
        WHERE cte.string_suffix <> ''
    ) AS hit
    -- stop recursing once no special character is left in the suffix
    WHERE hit.special_char_ind IS NOT NULL
)
SELECT
    special_char,
    ASCII(special_char) AS ascii_special_char,
    COUNT(*) AS cnt
FROM cte
-- skip the anchor rows, which carry no extracted character
WHERE special_char <> ''
GROUP BY special_char
-- lift the default recursion limit so long names can be walked to the end
OPTION (MAXRECURSION 0);

+--------------+--------------------+-----+
| special_char | ascii_special_char | cnt |
+--------------+--------------------+-----+
| | 9 | 1 |
+--------------+--------------------+-----+
| & | 38 | 1 |
+--------------+--------------------+-----+
| , | 44 | 1 |
+--------------+--------------------+-----+
| . | 46 | 5 |
+--------------+--------------------+-----+
| / | 47 | 1 |
+--------------+--------------------+-----+

Count Of Distinct Characters In Column

I would do this by creating a table of your letters similar to:

-- Lookup table with one row per character to be counted.
-- The column is NOT NULL and the primary key: a duplicated letter would be
-- joined twice and double its count, and a NULL letter can never match.
CREATE TABLE tblLetter
(
    letter varchar(1) NOT NULL,
    CONSTRAINT tblLetter_pk PRIMARY KEY (letter)
);

INSERT INTO tblLetter ([letter])
VALUES
    ('a'),
    ('b'),
    ('c'),
    ('d'); -- etc

Then you could join the letters to your table where your data is like the letter:

-- For every letter in tblLetter, the number of names.col values containing it.
-- CHARINDEX does a literal search, so lookup characters such as '%', '_' or
-- '[' are matched as themselves instead of acting as LIKE wildcards.
SELECT
    l.letter,
    COUNT(n.col) AS Total
FROM tblLetter AS l
INNER JOIN names AS n
    ON CHARINDEX(l.letter, n.col) > 0
GROUP BY l.letter;

See SQL Fiddle with Demo. This would give a result:

| LETTER | TOTAL |
|--------|-------|
| a | 5 |
| e | 3 |
| g | 1 |
| i | 3 |
| l | 4 |
| m | 1 |
| p | 2 |
| s | 4 |

Count number of unique characters in a string

There is no direct or easy way of doing it. You may need to write a stored function to do the job, checking for every character you expect in the data. Here is an example for just digits, which could be extended to all characters in a stored function:

mysql> select * from test ;
+------------+
| val |
+------------+
| 11111111 |
| 111222222 |
| 1113333222 |
+------------+

-- Number of distinct digits (out of 1-4) present in each distinct val.
-- Each CASE contributes 1 when its digit occurs in val. The terms are added
-- directly rather than wrapped in SUM(): with GROUP BY val a SUM would be
-- multiplied by the number of rows that share the same val.
select
    val,
    (case when locate('1', val) > 0 then 1 else 0 end)
    + (case when locate('2', val) > 0 then 1 else 0 end)
    + (case when locate('3', val) > 0 then 1 else 0 end)
    + (case when locate('4', val) > 0 then 1 else 0 end) as occurence
from test
group by val;

+------------+-----------+
| val | occurence |
+------------+-----------+
| 11111111 | 1 |
| 111222222 | 2 |
| 1113333222 | 3 |
+------------+-----------+

Or, if you have enough time, create a lookup table with all the characters you can think of. The query then shrinks to a couple of lines:

mysql> select * from test ;
+------------+
| val |
+------------+
| 11111111 |
| 111222222 |
| 1113333222 |
+------------+
3 rows in set (0.00 sec)

mysql> select * from look_up ;
+------+------+
| id | val |
+------+------+
| 1 | 1 |
| 2 | 2 |
| 3 | 3 |
| 4 | 4 |
+------+------+
4 rows in set (0.00 sec)

-- Number of distinct look_up characters present in each distinct test.val.
-- COUNT(DISTINCT ...) counts each matching lookup value once, so duplicated
-- rows in test or in look_up cannot inflate the result; the CASE yields NULL
-- (not counted) when the lookup value does not occur in the string.
select
    t1.val,
    count(distinct case when locate(t2.val, t1.val) > 0 then t2.val end) as occ
from test as t1
cross join look_up as t2
group by t1.val;

+------------+------+
| val | occ |
+------------+------+
| 11111111 | 1 |
| 111222222 | 2 |
| 1113333222 | 3 |
+------------+------+

T-SQL - Count unique characters in a variable

Using NGrams8K as a base, you can change the input parameter to a nvarchar(4000) and tweak the DATALENGTH, making NGramsN4K. Then you can use that to split the string into individual characters and count them:

-- Split @String1 into 1-character tokens and count the distinct ones.
SELECT
    COUNT(DISTINCT unigram.token) AS DistinctCharacters
FROM dbo.NGramsN4k(@String1, 1) AS unigram;

Altered NGrams8K:

IF OBJECT_ID('dbo.NGramsN4k','IF') IS NOT NULL DROP FUNCTION dbo.NGramsN4k;
GO
CREATE FUNCTION dbo.NGramsN4k
(
@string nvarchar(4000), -- Input string
@N int -- requested token size
)
/****************************************************************************************
Purpose:
A character-level N-Grams function that outputs a contiguous stream of @N-sized tokens
based on an input string (@string). Accepts strings up to 4000 nvarchar characters long.
This is the nvarchar(4000) adaptation of NGrams8k: the input parameter is nvarchar and
DATALENGTH is divided by 2 wherever a character count is needed.
For more information about N-Grams see: http://en.wikipedia.org/wiki/N-gram.

Compatibility:
SQL Server 2008+, Azure SQL Database

Syntax:
--===== Autonomous
SELECT position, token FROM dbo.NGramsN4k(@string,@N);

--===== Against a table using APPLY
SELECT s.SomeID, ng.position, ng.token
FROM dbo.SomeTable s
CROSS APPLY dbo.NGramsN4k(s.SomeValue,@N) ng;

Parameters:
@string = The input string to split into tokens.
@N = The size of each token returned.

Returns:
Position = bigint; the position of the token in the input string
token = nvarchar(4000); a @N-sized character-level N-Gram token

Developer Notes:
1. NGramsN4k is not case sensitive

2. Many functions that use NGramsN4k will see a huge performance gain when the optimizer
creates a parallel execution plan. One way to get a parallel query plan (if the
optimizer does not chose one) is to use make_parallel by Adam Machanic which can be
found here:
sqlblog.com/blogs/adam_machanic/archive/2013/07/11/next-level-parallel-plan-porcing.aspx

3. When @N is less than 1 or greater than the character length of the input string
(DATALENGTH/2) then no tokens (rows) are returned. If either @string or @N are NULL
no rows are returned.
This is a debatable topic but the thinking behind this decision is that: because you
can't split 'xxx' into 4-grams, you can't split a NULL value into unigrams and you
can't turn anything into NULL-grams, no rows should be returned.

For people who would prefer that a NULL input forces the function to return a single
NULL output you could add this code to the end of the function:

UNION ALL
SELECT 1, NULL
WHERE NOT(@N > 0 AND @N <= DATALENGTH(@string)/2) OR (@N IS NULL OR @string IS NULL)

4. NGramsN4k can also be used as a tally table with the position column being your "N"
row. To do so use REPLICATE to create an imaginary string, then use NGramsN4k to split
it into unigrams then only return the position column. NGramsN4k will get you up to
4000 numbers (the input is limited to nvarchar(4000)). There will be no performance
penalty for sorting by position in
ascending order but there is for sorting in descending order. To get the numbers in
descending order without forcing a sort in the query plan use the following formula:
N = <highest number>-position+1.

Pseudo Tally Table Examples:
--===== (1) Get the numbers 1 to 100 in ascending order:
SELECT N = position
FROM dbo.NGramsN4k(REPLICATE(0,100),1);

--===== (2) Get the numbers 1 to 100 in descending order:
DECLARE @maxN int = 100;
SELECT N = @maxN-position+1
FROM dbo.NGramsN4k(REPLICATE(0,@maxN),1)
ORDER BY position;

5. NGramsN4k is deterministic. For more about deterministic functions see:
https://msdn.microsoft.com/en-us/library/ms178091.aspx

Usage Examples:
--===== Turn the string, 'abcd' into unigrams, bigrams and trigrams
SELECT position, token FROM dbo.NGramsN4k('abcd',1); -- unigrams (@N=1)
SELECT position, token FROM dbo.NGramsN4k('abcd',2); -- bigrams (@N=2)
SELECT position, token FROM dbo.NGramsN4k('abcd',3); -- trigrams (@N=3)

--===== How many times the substring "AB" appears in each record
DECLARE @table TABLE(stringID int identity primary key, string varchar(100));
INSERT @table(string) VALUES ('AB123AB'),('123ABABAB'),('!AB!AB!'),('AB-AB-AB-AB-AB');

SELECT string, occurances = COUNT(*)
FROM @table t
CROSS APPLY dbo.NGramsN4k(t.string,2) ng
WHERE ng.token = 'AB'
GROUP BY string;

----------------------------------------------------------------------------------------
Revision History (inherited from NGrams8k):
Rev 00 - 20140310 - Initial Development - Alan Burstein
Rev 01 - 20150522 - Removed DQS N-Grams functionality, improved iTally logic. Also Added
conversion to bigint in the TOP logic to remove implicit conversion
to bigint - Alan Burstein
Rev 03 - 20150909 - Added logic to only return values if @N is greater than 0 and less
than the length of @string. Updated comment section. - Alan Burstein
Rev 04 - 20151029 - Added ISNULL logic to the TOP clause for the @string and @N
parameters to prevent a NULL string or NULL @N from causing "an
improper value" being passed to the TOP clause. - Alan Burstein
****************************************************************************************/
RETURNS TABLE WITH SCHEMABINDING AS RETURN
WITH
L1(N) AS
(
SELECT 1
FROM (VALUES -- 90 NULL values used to create the CTE Tally table
(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),
(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),
(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),
(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),
(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),
(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),
(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),
(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),
(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL)
) t(N)
),
iTally(N) AS -- my cte Tally table
(
-- Row count = (character length of @string) - (@N - 1), i.e. the number of @N-sized
-- tokens; DATALENGTH is halved because nvarchar uses 2 bytes per character here.
-- ISNULL keeps a NULL argument and ABS keeps a negative result from reaching TOP.
SELECT TOP(ABS(CONVERT(BIGINT,((DATALENGTH(ISNULL(@string,N''))/2)-(ISNULL(@N,1)-1)),0)))
ROW_NUMBER() OVER (ORDER BY (SELECT NULL)) -- Order by a constant to avoid a sort
FROM L1 a CROSS JOIN L1 b -- cartesian product for 8100 rows (90^2)
)
SELECT
position = N, -- position of the token in the string(s)
token = SUBSTRING(@string,CAST(N AS int),@N) -- the @N-Sized token
FROM iTally
WHERE @N > 0 AND @N <= (DATALENGTH(@string)/2); -- Protection against bad parameter values

How to get all distinct words of a specified minimum length from multiple columns in a MySQL table?

Shell script might be efficient...

  1. SELECT CONCAT_WS(' ', col_a, col_b, col_c) INTO OUTFILE 'x' ... to get the columns into a file
  2. tr ' ' "\n" <x -- split into one word per line
  3. awk 'length($1) >= 5' -- minimum size of 5 characters per word
  4. sort -u -- to dedup

There are no stopwords, but sed or awk could deal with that.

 # Step 1: dump the concatenated columns into file 'x'.
 # NOTE(review): INTO OUTFILE presumably writes on the MySQL server host, not
 # on the client - confirm that 'x' is readable where the pipeline runs.
 mysql -e "SELECT ... INTO OUTFILE 'x' ..." ...
# Steps 2-4: one word per line, keep words of 5+ characters, sort and dedup.
tr ' ' "\n" <x | awk 'length($1) >= 5' | sort -u

SQL DISTINCT values across rows

Try with this little change in your query:

-- Pivots up to five distinct telephones per agreement into columns.
with distinct_phone as (
    -- remove repeated agreementnumber/telephone pairs before numbering
    select distinct
        agreementnumber,
        telephone
    from alternate_mobile
),
ranked_phone as (
    -- number each agreement's telephones 1..n in telephone order
    select
        agreementnumber,
        telephone,
        row_number() over (partition by agreementnumber order by telephone) as rn
    from distinct_phone
)
select
    agreementnumber,
    max(case when rn = 1 then telephone end) as telephone1,
    max(case when rn = 2 then telephone end) as telephone2,
    max(case when rn = 3 then telephone end) as telephone3,
    max(case when rn = 4 then telephone end) as telephone4,
    max(case when rn = 5 then telephone end) as telephone5
from ranked_phone
group by agreementnumber;

If you were getting duplicate telephones, it is because you have duplicated agreementnumber/telephone pairs in your alternate_mobile table.

Edited:

I changed the query to keep just the digits in the telephone, removing all other characters:

-- Pivots up to five distinct telephones per agreement into columns, after
-- replacing the characters matched by '[^0-9]' in each telephone with ''.
with distinct_phone as (
    -- clean first, then deduplicate, so differently formatted copies of the
    -- same number collapse into one row
    select distinct
        agreementnumber,
        regexp_replace(telephone,'[^0-9]', '') as telephone
    from alternate_mobile
),
ranked_phone as (
    -- number each agreement's telephones 1..n in telephone order
    select
        agreementnumber,
        telephone,
        row_number() over (partition by agreementnumber order by telephone) as rn
    from distinct_phone
)
select
    agreementnumber,
    max(case when rn = 1 then telephone end) as telephone1,
    max(case when rn = 2 then telephone end) as telephone2,
    max(case when rn = 3 then telephone end) as telephone3,
    max(case when rn = 4 then telephone end) as telephone4,
    max(case when rn = 5 then telephone end) as telephone5
from ranked_phone
group by agreementnumber;

How do I get distinct characters of string column in mssql?

-- Builds @result: the distinct upper-cased characters of all names, in order
-- of first appearance (names taken in name order, characters left to right).
DECLARE @result VARCHAR(MAX);
SET @result = '';

DECLARE @t TABLE (name VARCHAR(400));

INSERT INTO @t (name)
SELECT 'Josef Knoller' UNION ALL SELECT 'Josef Somos' UNION ALL SELECT 'KFZ Wiesauer';

;WITH
-- L0..L4: repeated self cross joins, doubling the exponent each level
-- (2, 4, 16, 256, 65536 rows) to feed the numbers table
L0 AS (SELECT 1 AS c UNION ALL SELECT 1),
L1 AS (SELECT 1 AS c FROM L0 AS A CROSS JOIN L0 AS B),
L2 AS (SELECT 1 AS c FROM L1 AS A CROSS JOIN L1 AS B),
L3 AS (SELECT 1 AS c FROM L2 AS A CROSS JOIN L2 AS B),
L4 AS (SELECT 1 AS c FROM L3 AS A CROSS JOIN L3 AS B),
Nums AS (SELECT ROW_NUMBER() OVER (ORDER BY (SELECT 0)) AS i FROM L4),
-- 400 positions are enough: name is VARCHAR(400)
FilteredNums AS (SELECT i FROM Nums WHERE i <= 400),
Letters AS (
    -- one row per character of every name; RN records the reading order
    SELECT
        UPPER(SUBSTRING(name, i, 1)) AS L,
        ROW_NUMBER() OVER (ORDER BY name, i) AS RN
    FROM @t
    INNER JOIN FilteredNums
        ON FilteredNums.i <= LEN(name)
)
-- Ordered concatenation through FOR XML. Accumulating with
-- "SELECT @result = @result + L ... ORDER BY" has undefined results, so the
-- order (or even the content) of the string is not guaranteed.
-- COALESCE keeps the empty-string result when @t has no rows.
-- NOTE(review): FOR XML ... TYPE presumably rejects characters that are not
-- valid in XML (e.g. most control characters) - confirm for your data.
SELECT @result = COALESCE(
    (
        SELECT L AS [text()]
        FROM Letters
        GROUP BY L
        ORDER BY MIN(RN)
        FOR XML PATH(''), TYPE
    ).value('.', 'VARCHAR(MAX)'),
    ''
);

SELECT @result


Related Topics



Leave a reply



Submit