Sql: How to Get All The Distinct Characters in a Column, Across All Rows

SQL: how to get all the distinct characters in a column, across all rows

Given that your column is varchar, it means it can only store characters from codes 0 to 255, on whatever code page you have. If you only use the 32-128 ASCII code range, then you can simply see if you have any of the characters 32-128, one by one. The following query does that, looking in sys.objects.name:

-- Lists, as one string, every character with code 32-128 that occurs in at
-- least one sys.objects.name value.
--
-- Fixes over the earlier form:
--   * FOR XML PATH('') on its own returns entitized text ('&' -> '&amp;',
--     '<' -> '&lt;', '>' -> '&gt;'). Adding TYPE and reading the node back with
--     .value() returns the characters themselves.
--   * The candidate characters are ordered by code so the output is stable.
--   * The "does any row contain this character" test is an EXISTS instead of a
--     CROSS APPLY over TOP(1) *.
--
-- NOTE(review): CHARINDEX follows the collation of its inputs; under a
-- case-insensitive collation 'a' and 'A' presumably both match whenever either
-- one is present -- confirm if case matters.
with cteDigits as (
    -- The ten decimal digits.
    select d.Number
    from (values (0), (1), (2), (3), (4), (5), (6), (7), (8), (9)) as d(Number))
, cteNumbers as (
    -- 0..999 built from units, tens and hundreds digits.
    select U.Number + T.Number*10 + H.Number*100 as Number
    from cteDigits U
    cross join cteDigits T
    cross join cteDigits H)
, cteChars as (
    -- Candidate characters, keeping the code so the result can be ordered.
    select Number, CHAR(Number) as Char
    from cteNumbers
    where Number between 32 and 128)
select (
    select cteChars.Char as [*]
    from cteChars
    where exists (
        select 1
        from sys.objects as o
        where CHARINDEX(cteChars.Char, o.name) > 0)
    order by cteChars.Number
    for xml path(''), type
).value('.', 'nvarchar(max)') as DistinctChars;

Finding unique characters from a table field for all rows

Try this one -

-- Sample rows. @temp is not declared in this snippet; it is expected to be a
-- table variable with a txt column.
INSERT INTO @temp (txt)
VALUES ('abcd3'), ('abcdefg*');

-- 1. Concatenate every txt value into one string (all_text.mtxt).
-- 2. Use the type 'p' rows of master.dbo.spt_values as 0-based offsets to cut
--    that string into single characters.
-- 3. Concatenate the DISTINCT characters back into one string (disword).
SELECT disword = (
    SELECT DISTINCT chars.ch
    FROM (
        SELECT ch = SUBSTRING(all_text.mtxt, nums.number + 1, 1)
        FROM [master].dbo.spt_values AS nums
        CROSS JOIN (
            SELECT mtxt = (
                SELECT txt
                FROM @temp
                FOR XML PATH(N''), TYPE, ROOT
            ).value(N'root[1]', N'NVARCHAR(MAX)')
        ) AS all_text
        WHERE nums.[type] = N'p'
            AND nums.number < LEN(all_text.mtxt)
    ) AS chars
    FOR XML PATH(N''), TYPE, ROOT
).value(N'root[1]', N'NVARCHAR(MAX)');

Example (edited):

SET NOCOUNT ON;

-- Sample data: two short strings plus two 8000-character strings.
DECLARE @temp TABLE (txt VARCHAR(8000));

INSERT INTO @temp (txt)
VALUES ('abcd'), ('abcdefg*'), (REPLICATE('-', 8000)), (REPLICATE('+', 8000));

-- Tally table: one row per character position of the concatenated text.
DECLARE @t TABLE (i BIGINT);

-- Upper bound on the length of the concatenated text.
-- DATALENGTH is used instead of LEN because LEN ignores trailing spaces and
-- would undercount rows that end in blanks. Over-counting is harmless:
-- SUBSTRING past the end of the string yields ''.
DECLARE @l BIGINT = (
    SELECT SUM(CAST(DATALENGTH(txt) AS BIGINT))
    FROM @temp
);

-- Fill the tally in a single set-based statement instead of a WHILE loop.
-- Ten cross-joined digit sets can supply up to 10^10 rows; TOP stops at @l.
-- ISNULL covers an empty @temp, where SUM returns NULL.
WITH digits (d) AS (
    SELECT v.d
    FROM (VALUES (0), (1), (2), (3), (4), (5), (6), (7), (8), (9)) AS v (d)
)
INSERT INTO @t (i)
SELECT TOP (ISNULL(@l, 0))
    ROW_NUMBER() OVER (ORDER BY (SELECT NULL))
FROM digits AS d1
CROSS JOIN digits AS d2
CROSS JOIN digits AS d3
CROSS JOIN digits AS d4
CROSS JOIN digits AS d5
CROSS JOIN digits AS d6
CROSS JOIN digits AS d7
CROSS JOIN digits AS d8
CROSS JOIN digits AS d9
CROSS JOIN digits AS d10;

-- Concatenate all rows, cut the result into single characters at every tally
-- position, then concatenate the DISTINCT characters into one string.
SELECT disword = (
    SELECT DISTINCT dt.ch
    FROM (
        SELECT ch = SUBSTRING(t.mtxt, n.i, 1)
        FROM @t AS n
        CROSS JOIN (
            SELECT mtxt = (
                SELECT txt
                FROM @temp
                FOR XML PATH(N''), TYPE, ROOT
            ).value(N'root[1]', N'NVARCHAR(MAX)')
        ) AS t
    ) AS dt
    FOR XML PATH(N''), TYPE, ROOT
).value(N'root[1]', N'NVARCHAR(MAX)');

Have a look at this solution -

-- Counts how often each character occurs across all rows of @temp.
-- Relies on @temp (source strings) and @t (character positions) being
-- populated beforehand in the same batch.
-- NOTE(review): SQL Server string comparison ignores trailing spaces, so the
-- "<> ''" filter presumably removes the space character as well as the empty
-- strings produced past the end of the text -- confirm this is intended.
;WITH all_text AS (
    -- Every txt value concatenated into one string.
    SELECT mtxt = (
        SELECT txt
        FROM @temp
        FOR XML PATH(N''), TYPE, ROOT
    ).value(N'root[1]', N'NVARCHAR(MAX)')
),
chars AS (
    -- One row per tally position, holding the character found there.
    SELECT ch = SUBSTRING(a.mtxt, pos.i, 1)
    FROM @t AS pos
    CROSS JOIN all_text AS a
)
SELECT
      chars.ch
    , cnt = COUNT(*)
FROM chars
WHERE chars.ch <> ''
GROUP BY chars.ch
ORDER BY cnt DESC;

MySQL query to get all the distinct letters used across the rows

You could use SUBSTRING with derived tally table:

-- Distinct letters used across all rows of words.word (MySQL).
--
-- The derived table "tally" yields the positions 1..1000; each word is cut
-- into single characters at every position up to its length.
--
-- Fixes over the earlier form:
--   * The hundreds digit is multiplied by 100, not 1000. With 1000 the
--     sequence had gaps (1-100, 1001-1100, ...), so characters at positions
--     101-1000 were never inspected.
--   * CHAR_LENGTH counts characters; LENGTH counts bytes, which overshoots for
--     multibyte text and adds an empty-string "letter" to the result.
--   * Explicit CROSS JOINs replace the comma joins.
SELECT DISTINCT SUBSTRING(w.word, tally.n, 1) AS letter
FROM words AS w
CROSS JOIN
(
   SELECT ones.N + tens.N * 10 + hundreds.N * 100 + 1 AS n
   FROM
     (SELECT 0 AS N UNION ALL SELECT 1 UNION ALL SELECT 2 UNION ALL SELECT 3 UNION ALL SELECT 4 UNION ALL SELECT 5 UNION ALL SELECT 6 UNION ALL SELECT 7 UNION ALL SELECT 8 UNION ALL SELECT 9) AS ones
   CROSS JOIN
     (SELECT 0 AS N UNION ALL SELECT 1 UNION ALL SELECT 2 UNION ALL SELECT 3 UNION ALL SELECT 4 UNION ALL SELECT 5 UNION ALL SELECT 6 UNION ALL SELECT 7 UNION ALL SELECT 8 UNION ALL SELECT 9) AS tens
   CROSS JOIN
     (SELECT 0 AS N UNION ALL SELECT 1 UNION ALL SELECT 2 UNION ALL SELECT 3 UNION ALL SELECT 4 UNION ALL SELECT 5 UNION ALL SELECT 6 UNION ALL SELECT 7 UNION ALL SELECT 8 UNION ALL SELECT 9) AS hundreds
) AS tally
WHERE tally.n <= CHAR_LENGTH(w.word)
ORDER BY letter;

SqlFiddleDemo

If you need o and O as different letters add COLLATE utf8_bin:

-- Replacement for the first line of the query above: the utf8_bin collation
-- makes DISTINCT treat o and O as different letters.
SELECT DISTINCT SUBSTRING(word, n, 1) COLLATE utf8_bin AS letter

Extract Distinct Special Characters in a Column using SQL Select

There is one place where you need to define your "regular" characters

select  '%[^a-zA-Z0-9 ]%'

-- Lists every "special" character found in t.CompanyName together with its
-- ASCII code and the number of times it occurs.
--
-- prm : single-row parameter CTE holding the PATINDEX pattern. The bracket
--       expression is negated, so the pattern matches the first character that
--       is NOT a letter, a digit or a space.
-- cte : recursive CTE that walks each company name from left to right, emitting
--       one row per special character found.
with    prm (regular_char)
        as
        (
            select  '%[^a-zA-Z0-9 ]%'
        )

       ,cte (special_char,string_suffix)
        as
        (
            -- Anchor: one row per company name; no special character yet and
            -- the whole name still to be scanned.
            select  ''              as special_char
                   ,CompanyName     as string_suffix

            from    t

            union all

            -- Recursive step: take the character at the first match position
            -- and keep scanning the text that follows it.
            select  substring (string_suffix,special_char_ind,1)                    as special_char
                   ,substring (string_suffix,special_char_ind+1,len(string_suffix)) as string_suffix

                    -- patindex returns 0 when nothing matches; nullif turns that
                    -- into NULL so the outer filter can end the recursion.
            from   (select  string_suffix
                           ,nullif(patindex(prm.regular_char,string_suffix),0) as special_char_ind
                    from    cte,prm
                    where   string_suffix <> ''
                    )  t

            where   special_char_ind is not null

        )

select      special_char
           ,ascii(special_char) as ascii_special_char
           ,count(*)            as cnt      

from        cte

-- Drop the anchor rows, whose special_char is the empty string.
where       special_char <> ''

group by    special_char

-- maxrecursion 0 removes the recursion-depth limit for this statement.
option      (maxrecursion 0)

+--------------+--------------------+-----+
| special_char | ascii_special_char | cnt |
+--------------+--------------------+-----+
|              |  9                 | 1   |
+--------------+--------------------+-----+
| &            | 38                 | 1   |
+--------------+--------------------+-----+
| ,            | 44                 | 1   |
+--------------+--------------------+-----+
| .            | 46                 | 5   |
+--------------+--------------------+-----+
| /            | 47                 | 1   |
+--------------+--------------------+-----+

Count Of Distinct Characters In Column

I would do this by creating a table of your letters similar to:

-- Lookup table holding one row per letter to be counted.
CREATE TABLE tblLetter (letter varchar(1));

-- Seed rows; extend the list with every letter of interest.
INSERT INTO tblLetter ([letter])
VALUES ('a'), ('b'), ('c'), ('d'); -- etc

Then you could join the letters to your table where your data is like the letter:

-- For each letter in tblLetter, count the rows of names whose col contains it.
-- A row counts once per letter, however many times the letter occurs in it.
SELECT
    l.letter,
    COUNT(n.col) AS Total
FROM names AS n
INNER JOIN tblLetter AS l
    ON n.col LIKE '%' + l.letter + '%'
GROUP BY l.letter;

See SQL Fiddle with Demo. This would give a result:

| LETTER | TOTAL |
|--------|-------|
|      a |     5 |
|      e |     3 |
|      g |     1 |
|      i |     3 |
|      l |     4 |
|      m |     1 |
|      p |     2 |
|      s |     4 |

Count number of unique characters in a string

There is no direct or easy way of doing it. You may need to write a stored function to do the job, covering all the characters you expect in the data. Here is an example for just digits, which could be extended to all characters in a stored function:

mysql> select * from test ;
+------------+
| val        |
+------------+
| 11111111   |
| 111222222  |
| 1113333222 |
+------------+

-- For each distinct val, add up how many of the digits 1-4 it contains.
-- Each CASE contributes 1 when the digit occurs anywhere in val and 0 otherwise.
SELECT
    val,
    SUM(
        CASE WHEN LOCATE('1', val) > 0 THEN 1 ELSE 0 END
        + CASE WHEN LOCATE('2', val) > 0 THEN 1 ELSE 0 END
        + CASE WHEN LOCATE('3', val) > 0 THEN 1 ELSE 0 END
        + CASE WHEN LOCATE('4', val) > 0 THEN 1 ELSE 0 END
    ) AS occurence
FROM test
GROUP BY val;

+------------+-----------+
| val        | occurence |
+------------+-----------+
| 11111111   |         1 |
| 111222222  |         2 |
| 1113333222 |         3 |
+------------+-----------+

Or, if you have enough time, create a lookup table with all the characters you can think of, and write the query in two lines:

mysql> select * from test ;
+------------+
| val        |
+------------+
| 11111111   |
| 111222222  |
| 1113333222 |
+------------+
3 rows in set (0.00 sec)

mysql> select * from look_up ;
+------+------+
| id   | val  |
+------+------+
|    1 | 1    |
|    2 | 2    |
|    3 | 3    |
|    4 | 4    |
+------+------+
4 rows in set (0.00 sec)

-- Pair every test row with every look_up character and count, per val, how
-- many of the look_up characters occur in it.
SELECT
    t1.val,
    SUM(CASE WHEN LOCATE(t2.val, t1.val) > 0 THEN 1 ELSE 0 END) AS occ
FROM test AS t1
CROSS JOIN look_up AS t2
GROUP BY t1.val;

+------------+------+
| val        | occ  |
+------------+------+
| 11111111   |    1 |
| 111222222  |    2 |
| 1113333222 |    3 |
+------------+------+

T-SQL - Count unique characters in a variable

Using NGrams8K as a base, you can change the input parameter to an nvarchar(4000) and tweak the DATALENGTH, making NGramsN4k. Then you can use that to split the string into individual characters and count them:

-- Split @String1 into single-character tokens and count the distinct ones.
SELECT
    COUNT(DISTINCT chars.token) AS DistinctCharacters
FROM dbo.NGramsN4k(@String1, 1) AS chars;

Altered NGrams8K:

IF OBJECT_ID('dbo.NGramsN4k','IF') IS NOT NULL DROP FUNCTION dbo.NGramsN4k;
GO
CREATE FUNCTION dbo.NGramsN4k
(
  @string nvarchar(4000), -- Input string
  @N      int            -- requested token size
)
/****************************************************************************************
Purpose:
 A character-level N-Grams function that outputs a contiguous stream of @N-sized tokens 
 based on an input string (@string). Accepts strings up to 4000 nvarchar characters long.
 This is the nvarchar(4000) adaptation of NGrams8k: the input type is nvarchar and 
 DATALENGTH is halved wherever a character count is needed.
 For more information about N-Grams see: http://en.wikipedia.org/wiki/N-gram. 

Compatibility: 
 SQL Server 2008+, Azure SQL Database

Syntax:
--===== Autonomous
 SELECT position, token FROM dbo.NGramsN4k(@string,@N);

--===== Against a table using APPLY
 SELECT s.SomeID, ng.position, ng.token
 FROM dbo.SomeTable s
 CROSS APPLY dbo.NGramsN4k(s.SomeValue,@N) ng;

Parameters:
 @string  = The input string to split into tokens.
 @N       = The size of each token returned.

Returns:
 Position = bigint; the position of the token in the input string
 token    = nvarchar(4000); a @N-sized character-level N-Gram token

Developer Notes:  
 1. NGramsN4k is not case sensitive

 2. Many functions that use NGramsN4k will see a huge performance gain when the optimizer
    creates a parallel execution plan. One way to get a parallel query plan (if the 
    optimizer does not choose one) is to use make_parallel by Adam Machanic which can be 
    found here:
 sqlblog.com/blogs/adam_machanic/archive/2013/07/11/next-level-parallel-plan-porcing.aspx

 3. When @N is less than 1 or greater than the length (DATALENGTH/2) of the input string 
    then no tokens (rows) are returned. If either @string or @N are NULL no rows are 
    returned. This is a debatable topic but the thinking behind this decision is that: 
    because you can't split 'xxx' into 4-grams, you can't split a NULL value into 
    unigrams and you can't turn anything into NULL-grams, no rows should be returned.

    For people who would prefer that a NULL input forces the function to return a single
    NULL output you could add this code to the end of the function:

    UNION ALL 
    SELECT 1, NULL
    WHERE NOT(@N > 0 AND @N <= DATALENGTH(@string)/2) OR (@N IS NULL OR @string IS NULL)

 4. NGramsN4k can also be used as a tally table with the position column being your "N" 
    row. To do so use REPLICATE to create an imaginary string, then use NGramsN4k to 
    split it into unigrams then only return the position column. NGramsN4k will get you 
    up to 4000 numbers. There will be no performance penalty for sorting by position in 
    ascending order but there is for sorting in descending order. To get the numbers in
    descending order without forcing a sort in the query plan use the following formula:
    N = <highest number>-position+1. 

 Pseudo Tally Table Examples:
    --===== (1) Get the numbers 1 to 100 in ascending order:
    SELECT N = position 
    FROM dbo.NGramsN4k(REPLICATE(0,100),1);

    --===== (2) Get the numbers 1 to 100 in descending order:
    DECLARE @maxN int = 100;
    SELECT N = @maxN-position+1
    FROM dbo.NGramsN4k(REPLICATE(0,@maxN),1)
    ORDER BY position;

 5. NGramsN4k is deterministic. For more about deterministic functions see:
    https://msdn.microsoft.com/en-us/library/ms178091.aspx

Usage Examples:
--===== Turn the string, 'abcd' into unigrams, bigrams and trigrams
 SELECT position, token FROM dbo.NGramsN4k('abcd',1); -- unigrams (@N=1)
 SELECT position, token FROM dbo.NGramsN4k('abcd',2); -- bigrams  (@N=2)
 SELECT position, token FROM dbo.NGramsN4k('abcd',3); -- trigrams (@N=3)

--===== How many times the substring "AB" appears in each record
 DECLARE @table TABLE(stringID int identity primary key, string varchar(100));
 INSERT @table(string) VALUES ('AB123AB'),('123ABABAB'),('!AB!AB!'),('AB-AB-AB-AB-AB');

 SELECT string, occurrences = COUNT(*) 
 FROM @table t
 CROSS APPLY dbo.NGramsN4k(t.string,2) ng
 WHERE ng.token = 'AB'
 GROUP BY string;

----------------------------------------------------------------------------------------
Revision History (inherited from NGrams8k):
 Rev 00 - 20140310 - Initial Development - Alan Burstein
 Rev 01 - 20150522 - Removed DQS N-Grams functionality, improved iTally logic. Also Added
                     conversion to bigint in the TOP logic to remove implicit conversion
                     to bigint - Alan Burstein
 Rev 03 - 20150909 - Added logic to only return values if @N is greater than 0 and less 
                     than the length of @string. Updated comment section. - Alan Burstein
 Rev 04 - 20151029 - Added ISNULL logic to the TOP clause for the @string and @N 
                     parameters to prevent a NULL string or NULL @N from causing "an 
                     improper value" being passed to the TOP clause. - Alan Burstein
****************************************************************************************/
RETURNS TABLE WITH SCHEMABINDING AS RETURN
WITH 
L1(N) AS 
(
  SELECT 1
  FROM (VALUES    -- 90 NULL values used to create the CTE Tally table
        (NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),
        (NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),
        (NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),
        (NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),
        (NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),
        (NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),
        (NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),
        (NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),
        (NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL)
       ) t(N)
),
iTally(N) AS                                   -- my cte Tally table
(
  -- Row count = character count - (@N - 1), i.e. the number of @N-sized tokens.
  -- DATALENGTH is halved to turn nvarchar bytes into a character count; ISNULL keeps
  -- TOP from receiving NULL and ABS keeps it from receiving a negative value.
  SELECT TOP(ABS(CONVERT(BIGINT,((DATALENGTH(ISNULL(@string,N''))/2)-(ISNULL(@N,1)-1)),0)))
    ROW_NUMBER() OVER (ORDER BY (SELECT NULL)) -- Order by a constant to avoid a sort
  FROM L1 a CROSS JOIN L1 b                    -- cartesian product for 8100 rows (90^2)
)
SELECT
  position = N,                                -- position of the token in the string(s)
  token    = SUBSTRING(@string,CAST(N AS int),@N)  -- the @N-Sized token
FROM iTally
WHERE @N > 0 AND @N <= (DATALENGTH(@string)/2);    -- Protection against bad parameter values

How to get all distinct words of a specified minimum length from multiple columns in a MySQL table?

Shell script might be efficient...

SELECT CONCAT_WS(' ', col_a, col_b, col_c) INTO OUTFILE 'x' ... to get the columns into a file
tr ' ' "\n" <x -- split into one word per line
awk 'length($1) >= 5' -- minimum size of 5 characters per word
sort -u -- to dedup

There are no stopwords, but sed or awk could deal with that.

 mysql -e "SELECT ... INTO OUTFILE 'x' ..." ...
 tr ' ' "\n" <x  |  awk 'length($1) >= 5'  |  sort -u

SQL DISTINCT values across rows

Try with this little change in your query:

-- Pivot up to five distinct telephones per agreement into columns.
with distinct_phones as (
  -- Remove duplicated agreementnumber/telephone pairs before numbering.
  select distinct agreementnumber, telephone
  from alternate_mobile
),
ranked as (
  -- Number each agreement's telephones 1..n in telephone order.
  select agreementnumber,
    telephone,
    row_number() over (partition by agreementnumber order by telephone) as rn
  from distinct_phones
)
select agreementnumber,
  max(case rn when 1 then telephone end) as telephone1,
  max(case rn when 2 then telephone end) as telephone2,
  max(case rn when 3 then telephone end) as telephone3,
  max(case rn when 4 then telephone end) as telephone4,
  max(case rn when 5 then telephone end) as telephone5
from ranked
group by agreementnumber;

If you were getting duplicate telephones, it is because you have duplicated agreementnumber/telephone pairs in your alternate_mobile table.

Edited:

I changed the query to keep only the digits in the telephone, removing all other characters:

-- Pivot up to five distinct telephones per agreement into columns, after
-- removing the characters matched by '[^0-9]' from each telephone.
with cleaned_phones as (
  -- Clean first, then de-duplicate, so differently formatted copies collapse.
  select distinct agreementnumber,
    regexp_replace(telephone,'[^0-9]', '') as telephone
  from alternate_mobile
),
ranked as (
  -- Number each agreement's telephones 1..n in telephone order.
  select agreementnumber,
    telephone,
    row_number() over (partition by agreementnumber order by telephone) as rn
  from cleaned_phones
)
select agreementnumber,
  max(case rn when 1 then telephone end) as telephone1,
  max(case rn when 2 then telephone end) as telephone2,
  max(case rn when 3 then telephone end) as telephone3,
  max(case rn when 4 then telephone end) as telephone4,
  max(case rn when 5 then telephone end) as telephone5
from ranked
group by agreementnumber;

How do I get distinct characters of string column in mssql?

-- Builds @result: the distinct upper-cased characters used in @t.name, in
-- order of first appearance (rows taken in name order, characters left to right).
--
-- Fix over the earlier form: the string was built with
-- "SELECT @result = @result + L ... ORDER BY", whose result is not guaranteed
-- when the SELECT returns several rows. The characters are now concatenated
-- with an ordered FOR XML PATH subquery; TYPE plus .value() returns them
-- without XML entitization.
DECLARE @result VARCHAR(MAX);
SET @result = '';

-- Sample data.
DECLARE @t TABLE (name VARCHAR(400));

INSERT INTO @t (name)
SELECT 'Josef Knoller' UNION ALL SELECT 'Josef Somos' UNION ALL SELECT 'KFZ Wiesauer';

;WITH
-- L0..L4 double up by cross joins: 2, 4, 16, 256 and 65536 rows.
L0 AS (SELECT 1 AS c UNION ALL SELECT 1),
L1 AS (SELECT 1 AS c FROM L0 A CROSS JOIN L0 B),
L2 AS (SELECT 1 AS c FROM L1 A CROSS JOIN L1 B),
L3 AS (SELECT 1 AS c FROM L2 A CROSS JOIN L2 B),
L4 AS (SELECT 1 AS c FROM L3 A CROSS JOIN L3 B),
Nums AS (SELECT ROW_NUMBER() OVER (ORDER BY (SELECT 0)) AS i FROM L4),
-- Only 400 positions are needed: name is VARCHAR(400).
FilteredNums AS (SELECT i FROM Nums WHERE i <= 400),
-- One row per character of every name; RN records the order of appearance.
Letters AS (
    SELECT
        UPPER(SUBSTRING(name, i, 1)) AS L,
        ROW_NUMBER() OVER (ORDER BY name, i) AS RN
    FROM @t
    INNER JOIN FilteredNums
        ON FilteredNums.i <= LEN(name))
-- ISNULL keeps @result at '' when @t is empty (the XML subquery yields NULL).
SELECT @result = ISNULL((
    SELECT L AS [text()]
    FROM Letters
    GROUP BY L
    ORDER BY MIN(RN)
    FOR XML PATH(''), TYPE
).value('.', 'VARCHAR(MAX)'), '');

SELECT @result;

Sql: How to Get All The Distinct Characters in a Column, Across All Rows