SQL: how to get all the distinct characters in a column, across all rows
Given that your column is varchar, it means it can only store characters from codes 0 to 255, on whatever code page you have. If you only use the 32-128 ASCII code range, then you can simply see if you have any of the characters 32-128, one by one. The following query does that, looking in sys.objects.name:
with cteDigits as (
select 0 as Number
union all select 1 as Number
union all select 2 as Number
union all select 3 as Number
union all select 4 as Number
union all select 5 as Number
union all select 6 as Number
union all select 7 as Number
union all select 8 as Number
union all select 9 as Number)
, cteNumbers as (
select U.Number + T.Number*10 + H.Number*100 as Number
from cteDigits U
cross join cteDigits T
cross join cteDigits H)
, cteChars as (
select CHAR(Number) as Char
from cteNumbers
where Number between 32 and 128)
select cteChars.Char as [*]
from cteChars
cross apply (
select top(1) *
from sys.objects
where CHARINDEX(cteChars.Char, name, 0) > 0) as o
for xml path('');
Finding unique characters from a table field for all rows
Try this one -
INSERT INTO @temp (txt)
VALUES ('abcd3'), ('abcdefg*')
SELECT disword = (
SELECT DISTINCT dt.ch
FROM (
SELECT ch = SUBSTRING(t.mtxt, n.number + 1, 1)
FROM [master].dbo.spt_values n
CROSS JOIN (
SELECT mtxt = (
SELECT txt
FROM @temp
FOR XML PATH(N''), TYPE, ROOT).value(N'root[1]', N'NVARCHAR(MAX)'
)
) t
WHERE [type] = N'p'
AND number <= LEN(mtxt) - 1
) dt
FOR XML PATH(N''), TYPE, ROOT).value(N'root[1]', N'NVARCHAR(MAX)'
)
Example (edited):
SET NOCOUNT ON;
DECLARE @temp TABLE (txt VARCHAR(8000))
INSERT INTO @temp (txt)
VALUES ('abcd'), ('abcdefg*'), (REPLICATE('-', 8000)), (REPLICATE('+', 8000))
DECLARE @t TABLE (i BIGINT)
DECLARE
@i BIGINT = 1
, @l BIGINT = (
SELECT SUM(LEN(txt))
FROM @temp
)
WHILE (@i <= @l) BEGIN
INSERT INTO @t (i)
VALUES (@i), (@i+1), (@i+2), (@i+3), (@i+4), (@i+5), (@i+6), (@i+7), (@i+8), (@i+9)
SELECT @i += 10
END
SELECT disword = (
SELECT DISTINCT dt.ch
FROM (
SELECT ch = SUBSTRING(t.mtxt, n.i, 1)
FROM @t n
CROSS JOIN (
SELECT mtxt = (
SELECT txt
FROM @temp
FOR XML PATH(N''), TYPE, ROOT).value(N'root[1]', N'NVARCHAR(MAX)'
)
) t
) dt
FOR XML PATH(N''), TYPE, ROOT).value(N'root[1]', N'NVARCHAR(MAX)'
)
Have a look a t this solution -
SELECT
dt.ch
, cnt = COUNT(1)
FROM (
SELECT ch = SUBSTRING(t.mtxt, n.i, 1)
FROM @t n
CROSS JOIN (
SELECT mtxt = (
SELECT txt
FROM @temp
FOR XML PATH(N''), TYPE, ROOT).value(N'root[1]', N'NVARCHAR(MAX)')
) t
) dt
WHERE dt.ch != ''
GROUP BY dt.ch
ORDER BY cnt DESC
MySQL query to get all the distinct letters used across the rows
You could use SUBSTRING
with derived tally table:
SELECT DISTINCT SUBSTRING(word, n, 1) AS letter
FROM words w
CROSS JOIN
(
SELECT a.N + b.N * 10 + c.N * 1000 + 1 n
FROM
(SELECT 0 AS N UNION ALL SELECT 1 UNION ALL SELECT 2 UNION ALL SELECT 3 UNION ALL SELECT 4 UNION ALL SELECT 5 UNION ALL SELECT 6 UNION ALL SELECT 7 UNION ALL SELECT 8 UNION ALL SELECT 9) a
,(SELECT 0 AS N UNION ALL SELECT 1 UNION ALL SELECT 2 UNION ALL SELECT 3 UNION ALL SELECT 4 UNION ALL SELECT 5 UNION ALL SELECT 6 UNION ALL SELECT 7 UNION ALL SELECT 8 UNION ALL SELECT 9) b
,(SELECT 0 AS N UNION ALL SELECT 1 UNION ALL SELECT 2 UNION ALL SELECT 3 UNION ALL SELECT 4 UNION ALL SELECT 5 UNION ALL SELECT 6 UNION ALL SELECT 7 UNION ALL SELECT 8 UNION ALL SELECT 9) c
) n
WHERE n <= LENGTH(w.word)
ORDER BY letter;
SqlFiddleDemo
If you need o
and O
as different letters add COLLATE utf8_bin
:
SELECT DISTINCT SUBSTRING(word, n, 1) COLLATE utf8_bin AS letter
Extract Distinct Special Characters in a Column using SQL Select
There is one place where you need to define your "regular" characters
select '%[^a-zA-Z0-9 ]%'
with prm (regular_char)
as
(
select '%[^a-zA-Z0-9 ]%'
)
,cte (special_char,string_suffix)
as
(
select '' as special_char
,CompanyName as string_suffix
from t
union all
select substring (string_suffix,special_char_ind,1) as special_char
,substring (string_suffix,special_char_ind+1,len(string_suffix)) as string_suffix
from (select string_suffix
,nullif(patindex(prm.regular_char,string_suffix),0) as special_char_ind
from cte,prm
where string_suffix <> ''
) t
where special_char_ind is not null
)
select special_char
,ascii(special_char) as ascii_special_char
,count(*) as cnt
from cte
where special_char <> ''
group by special_char
option (maxrecursion 0)
+--------------+--------------------+-----+
| special_char | ascii_special_char | cnt |
+--------------+--------------------+-----+
| | 9 | 1 |
+--------------+--------------------+-----+
| & | 38 | 1 |
+--------------+--------------------+-----+
| , | 44 | 1 |
+--------------+--------------------+-----+
| . | 46 | 5 |
+--------------+--------------------+-----+
| / | 47 | 1 |
+--------------+--------------------+-----+
Count Of Distinct Characters In Column
I would do this by creating a table of your letters similar to:
CREATE TABLE tblLetter
(
letter varchar(1)
);
INSERT INTO tblLetter ([letter])
VALUES
('a'),
('b'),
('c'),
('d'); -- etc
Then you could join the letters
to your table where your data is like the letter:
select l.letter, count(n.col) Total
from tblLetter l
inner join names n
on n.col like '%'+l.letter+'%'
group by l.letter;
See SQL Fiddle with Demo. This would give a result:
| LETTER | TOTAL |
|--------|-------|
| a | 5 |
| e | 3 |
| g | 1 |
| i | 3 |
| l | 4 |
| m | 1 |
| p | 2 |
| s | 4 |
Count number of unique characters in a string
There is no direct or easy way of doing it. You may need to write a store function to do the job and by looking at all the characters you may expect in the data. Here is an example for just digits , which could be extended for all the characters in a stored function
mysql> select * from test ;
+------------+
| val |
+------------+
| 11111111 |
| 111222222 |
| 1113333222 |
+------------+
select
val,
sum(case when locate('1',val) > 0 then 1 else 0 end )
+ sum( case when locate('2',val) > 0 then 1 else 0 end)
+ sum(case when locate('3',val) > 0 then 1 else 0 end)
+sum(case when locate('4',val) > 0 then 1 else 0 end ) as occurence
from test group by val
+------------+-----------+
| val | occurence |
+------------+-----------+
| 11111111 | 1 |
| 111222222 | 2 |
| 1113333222 | 3 |
+------------+-----------+
Or if you have enough time , create a lookup table with all the characters you could think of. And make the query in 2 lines
mysql> select * from test ;
+------------+
| val |
+------------+
| 11111111 |
| 111222222 |
| 1113333222 |
+------------+
3 rows in set (0.00 sec)
mysql> select * from look_up ;
+------+------+
| id | val |
+------+------+
| 1 | 1 |
| 2 | 2 |
| 3 | 3 |
| 4 | 4 |
+------+------+
4 rows in set (0.00 sec)
select
t1.val,
sum(case when locate(t2.val,t1.val) > 0 then 1 else 0 end ) as occ
from test t1,(select * from look_up)t2
group by t1.val ;
+------------+------+
| val | occ |
+------------+------+
| 11111111 | 1 |
| 111222222 | 2 |
| 1113333222 | 3 |
+------------+------+
T-SQL - Count unique characters in a variable
Using NGrams8K
as a base, you can change the input parameter to a nvarchar(4000)
and tweak the DATALENGTH
, making NGramsN4K
. Then you can use that to split the string into individual characters and count them:
SELECT COUNT(DISTINCT NG.token) AS DistinctCharacters
FROM dbo.NGramsN4k(@String1,1) NG;
Altered NGrams8K
:
IF OBJECT_ID('dbo.NGramsN4k','IF') IS NOT NULL DROP FUNCTION dbo.NGramsN4k;
GO
CREATE FUNCTION dbo.NGramsN4k
(
@string nvarchar(4000), -- Input string
@N int -- requested token size
)
/****************************************************************************************
Purpose:
A character-level N-Grams function that outputs a contiguous stream of @N-sized tokens
based on an input string (@string). Accepts strings up to 8000 varchar characters long.
For more information about N-Grams see: http://en.wikipedia.org/wiki/N-gram.
Compatibility:
SQL Server 2008+, Azure SQL Database
Syntax:
--===== Autonomous
SELECT position, token FROM dbo.NGrams8k(@string,@N);
--===== Against a table using APPLY
SELECT s.SomeID, ng.position, ng.token
FROM dbo.SomeTable s
CROSS APPLY dbo.NGrams8K(s.SomeValue,@N) ng;
Parameters:
@string = The input string to split into tokens.
@N = The size of each token returned.
Returns:
Position = bigint; the position of the token in the input string
token = varchar(8000); a @N-sized character-level N-Gram token
Developer Notes:
1. NGrams8k is not case sensitive
2. Many functions that use NGrams8k will see a huge performance gain when the optimizer
creates a parallel execution plan. One way to get a parallel query plan (if the
optimizer does not chose one) is to use make_parallel by Adam Machanic which can be
found here:
sqlblog.com/blogs/adam_machanic/archive/2013/07/11/next-level-parallel-plan-porcing.aspx
3. When @N is less than 1 or greater than the datalength of the input string then no
tokens (rows) are returned. If either @string or @N are NULL no rows are returned.
This is a debatable topic but the thinking behind this decision is that: because you
can't split 'xxx' into 4-grams, you can't split a NULL value into unigrams and you
can't turn anything into NULL-grams, no rows should be returned.
For people who would prefer that a NULL input forces the function to return a single
NULL output you could add this code to the end of the function:
UNION ALL
SELECT 1, NULL
WHERE NOT(@N > 0 AND @N <= DATALENGTH(@string)) OR (@N IS NULL OR @string IS NULL)
4. NGrams8k can also be used as a tally table with the position column being your "N"
row. To do so use REPLICATE to create an imaginary string, then use NGrams8k to split
it into unigrams then only return the position column. NGrams8k will get you up to
8000 numbers. There will be no performance penalty for sorting by position in
ascending order but there is for sorting in descending order. To get the numbers in
descending order without forcing a sort in the query plan use the following formula:
N = <highest number>-position+1.
Pseudo Tally Table Examples:
--===== (1) Get the numbers 1 to 100 in ascending order:
SELECT N = position
FROM dbo.NGrams8k(REPLICATE(0,100),1);
--===== (2) Get the numbers 1 to 100 in descending order:
DECLARE @maxN int = 100;
SELECT N = @maxN-position+1
FROM dbo.NGrams8k(REPLICATE(0,@maxN),1)
ORDER BY position;
5. NGrams8k is deterministic. For more about deterministic functions see:
https://msdn.microsoft.com/en-us/library/ms178091.aspx
Usage Examples:
--===== Turn the string, 'abcd' into unigrams, bigrams and trigrams
SELECT position, token FROM dbo.NGrams8k('abcd',1); -- unigrams (@N=1)
SELECT position, token FROM dbo.NGrams8k('abcd',2); -- bigrams (@N=2)
SELECT position, token FROM dbo.NGrams8k('abcd',3); -- trigrams (@N=3)
--===== How many times the substring "AB" appears in each record
DECLARE @table TABLE(stringID int identity primary key, string varchar(100));
INSERT @table(string) VALUES ('AB123AB'),('123ABABAB'),('!AB!AB!'),('AB-AB-AB-AB-AB');
SELECT string, occurances = COUNT(*)
FROM @table t
CROSS APPLY dbo.NGrams8k(t.string,2) ng
WHERE ng.token = 'AB'
GROUP BY string;
----------------------------------------------------------------------------------------
Revision History:
Rev 00 - 20140310 - Initial Development - Alan Burstein
Rev 01 - 20150522 - Removed DQS N-Grams functionality, improved iTally logic. Also Added
conversion to bigint in the TOP logic to remove implicit conversion
to bigint - Alan Burstein
Rev 03 - 20150909 - Added logic to only return values if @N is greater than 0 and less
than the length of @string. Updated comment section. - Alan Burstein
Rev 04 - 20151029 - Added ISNULL logic to the TOP clause for the @string and @N
parameters to prevent a NULL string or NULL @N from causing "an
improper value" being passed to the TOP clause. - Alan Burstein
****************************************************************************************/
RETURNS TABLE WITH SCHEMABINDING AS RETURN
WITH
L1(N) AS
(
SELECT 1
FROM (VALUES -- 90 NULL values used to create the CTE Tally table
(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),
(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),
(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),
(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),
(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),
(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),
(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),
(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),
(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL)
) t(N)
),
iTally(N) AS -- my cte Tally table
(
SELECT TOP(ABS(CONVERT(BIGINT,((DATALENGTH(ISNULL(@string,N''))/2)-(ISNULL(@N,1)-1)),0)))
ROW_NUMBER() OVER (ORDER BY (SELECT NULL)) -- Order by a constant to avoid a sort
FROM L1 a CROSS JOIN L1 b -- cartesian product for 8100 rows (90^2)
)
SELECT
position = N, -- position of the token in the string(s)
token = SUBSTRING(@string,CAST(N AS int),@N) -- the @N-Sized token
FROM iTally
WHERE @N > 0 AND @N <= (DATALENGTH(@string)/2); -- Protection against bad parameter values
How to get all distinct words of a specified minimum length from multiple columns in a MySQL table?
Shell script might be efficient...
SELECT CONCAT_WS(' ', col_a, col_b, col_c) INTO OUTFILE 'x' ...
to get the columns into a filetr ' ' "\n" <x
-- split into one word per lineawk 'length($1) >= 5'
-- minimum size of 5 characters per wordsort -u
-- to dedup
There are no stopwords, but sed or awk could deal with that.
mysql -e "SELECT ... INTO OUTFILE 'x' ..." ...
tr ' ' "\n" <x | awk 'length($1) >= 5' | sort -u
SQL DISTINCT values across rows
Try with this little change in your query:
select agreementnumber,
max(case when rn = 1 then telephone end) telephone1,
max(case when rn = 2 then telephone end) telephone2,
max(case when rn = 3 then telephone end) telephone3,
max(case when rn = 4 then telephone end) telephone4,
max(case when rn = 5 then telephone end) telephone5
from
(
select x.*,
row_number() over(partition by x.agreementnumber order by x.telephone) rn
from (
select distinct agreementnumber, telephone
from alternate_mobile
) x
) src
group by agreementnumber;
If you were getting duplicate telephones is because you have duplicated agreementnumber
/telephone
in your alternate_mobile
table.
Edited:
I change the query to keep just numbers in the telephone, removing all the rest of characters:
select agreementnumber,
max(case when rn = 1 then telephone end) telephone1,
max(case when rn = 2 then telephone end) telephone2,
max(case when rn = 3 then telephone end) telephone3,
max(case when rn = 4 then telephone end) telephone4,
max(case when rn = 5 then telephone end) telephone5
from
(
select x.*,
row_number() over(partition by x.agreementnumber order by x.telephone) rn
from (
select distinct agreementnumber, regexp_replace(telephone,'[^0-9]', '') as telephone
from alternate_mobile
) x
) src
group by agreementnumber;
How do I get distinct characters of string column in mssql?
DECLARE @result VARCHAR(MAX)
SET @result = ''
DECLARE @t TABLE(name VARCHAR(400))
INSERT INTO @t
SELECT 'Josef Knoller' UNION ALL SELECT 'Josef Somos' UNION ALL SELECT 'KFZ Wiesauer'
;WITH
L0 AS (SELECT 1 AS c UNION ALL SELECT 1),
L1 AS (SELECT 1 AS c FROM L0 A CROSS JOIN L0 B),
L2 AS (SELECT 1 AS c FROM L1 A CROSS JOIN L1 B),
L3 AS (SELECT 1 AS c FROM L2 A CROSS JOIN L2 B),
L4 AS (SELECT 1 AS c FROM L3 A CROSS JOIN L3 B),
Nums AS (SELECT ROW_NUMBER() OVER (ORDER BY (SELECT 0)) AS i FROM L4),
FilteredNums AS (SELECT i FROM Nums WHERE i<= 400),
Letters AS(
SELECT UPPER(SUBSTRING(name,i,1)) AS L, ROW_NUMBER() OVER (ORDER BY name,i) AS RN
FROM @t
JOIN FilteredNums ON FilteredNums.i <= LEN(name))
SELECT @result = @result + L
FROM Letters
GROUP BY L
ORDER BY MIN(RN)
SELECT @result
Related Topics
MySQL Correlated Subquery in Join Syntax
What Is and Examples of Using Data Type - References
Pure-SQL Technique for Auto-Numbering Rows in Result Set
Returning The First X Records in a Postgresql Query with a Unique Field
How to Get Records Before and After Given One
What Can Happen as a Result of Using (Nolock) on Every Select in SQL Server
Count of Unique Values in a Rolling Date Range for R
Pl/Sql- Get Column Names from a Query
Rails 3 Sum Product of Two Fields
What Data Can Be Stored in Varbinary Data Type of SQL Server
Ruby Activerecord and SQL Tuple Support
Sort by Day of The Week from Monday to Sunday
Sql: Subquery Has Too Many Columns
How to Retrieve a Column Value by Name Using Golang Database/Sql
Rails Brakeman Warning of SQL Injection