Split String into Several Rows

Turning a Comma Separated string into individual rows

You can use the wonderful recursive functions from SQL Server:


Sample table:

CREATE TABLE Testdata
(
SomeID INT,
OtherID INT,
String VARCHAR(MAX)
);

INSERT Testdata SELECT 1, 9, '18,20,22';
INSERT Testdata SELECT 2, 8, '17,19';
INSERT Testdata SELECT 3, 7, '13,19,20';
INSERT Testdata SELECT 4, 6, '';
INSERT Testdata SELECT 9, 11, '1,2,3,4';

The query

WITH tmp(SomeID, OtherID, DataItem, String) AS
(
SELECT
SomeID,
OtherID,
LEFT(String, CHARINDEX(',', String + ',') - 1),
STUFF(String, 1, CHARINDEX(',', String + ','), '')
FROM Testdata
UNION all

SELECT
SomeID,
OtherID,
LEFT(String, CHARINDEX(',', String + ',') - 1),
STUFF(String, 1, CHARINDEX(',', String + ','), '')
FROM tmp
WHERE
String > ''
)
SELECT
SomeID,
OtherID,
DataItem
FROM tmp
ORDER BY SomeID;
-- OPTION (maxrecursion 0)
-- normally recursion is limited to 100. If you know you have very long
-- strings, uncomment the option

Output

 SomeID | OtherID | DataItem 
--------+---------+----------
1 | 9 | 18
1 | 9 | 20
1 | 9 | 22
2 | 8 | 17
2 | 8 | 19
3 | 7 | 13
3 | 7 | 19
3 | 7 | 20
4 | 6 |
9 | 11 | 1
9 | 11 | 2
9 | 11 | 3
9 | 11 | 4

Split string into multiple rows and columns

I rewrote this as a set based query instead of a cursor and this whole script, including the generation of 100k test records runs in about 40 seconds on my Azure SQL DB, which is a Serverless gen 5 with only 1 vCore. Work through the script to make sure you understand it.

NB I'm dropping the tables as this is a test rig - not production code:

------------------------------------------------------------------------------------------------
-- Setup START
------------------------------------------------------------------------------------------------

DROP TABLE IF EXISTS dbo.sourceTable
DROP TABLE IF EXISTS dbo.finalTable
GO

CREATE TABLE dbo.sourceTable
(
PersonId INT IDENTITY PRIMARY KEY,
LocationId INT,
StartDate DATE,
AttendanceString VARCHAR(1000)
)
GO

CREATE TABLE dbo.finalTable
(
DayDate DATE,
LocationId INT,
PersonId INT,
MorningValue CHAR(1),
AfternoonValue CHAR(1)
)
GO

-- Generate some test data
SET IDENTITY_INSERT dbo.sourceTable ON

INSERT INTO dbo.sourceTable ( PersonId, LocationId, StartDate, AttendanceString )
VALUES
( 123, 987, '2018-09-01', 'XXXXZZZZ######PPLL' ),
( 234, 678, '2018-10-01', 'PPPPLL######ZZZZXX' ),
( 567, 999, '2018-10-01', 'abcdefghijklmnopqr' )

SET IDENTITY_INSERT dbo.sourceTable OFF
GO

-- Setup END
------------------------------------------------------------------------------------------------

------------------------------------------------------------------------------------------------
-- Test Data START
------------------------------------------------------------------------------------------------

;WITH cte AS (
SELECT 1 rn, 1 locationId, CAST( '1 Jan 2018' AS DATE ) startDate, REPLACE( NEWID(), '-', '' ) AttendanceString
UNION ALL
SELECT rn + 1, rn % 42, DATEADD( day, 1, startDate ), REPLACE( NEWID(), '-', '' )
FROM cte
WHERE rn < 100
)
INSERT INTO dbo.sourceTable ( LocationId, StartDate, AttendanceString )
SELECT LocationId, StartDate, AttendanceString
FROM cte
ORDER BY 1;
GO 1000

-- Test Data END
------------------------------------------------------------------------------------------------

------------------------------------------------------------------------------------------------
-- Rewritten query START
------------------------------------------------------------------------------------------------

DROP TABLE IF EXISTS #tmp;

;WITH cte AS (
SELECT 1 n, 1 x, 2 y
UNION ALL
SELECT n + 1, x + 2, y + 2
FROM cte
WHERE n < 20
)
SELECT
personId,
locationId,
DATEADD( day, c.n - 1, startDate ) xdate,
SUBSTRING ( attendanceString, c.x, 1 ) a,
SUBSTRING ( attendanceString, c.y, 1 ) b

INTO #tmp

FROM dbo.sourceTable s
CROSS APPLY cte c
WHERE c.y <= LEN(attendanceString);

select *
from sourceTable
WHERE personId = 999

select *
from #tmp
WHERE personId = 999

select *
from #tmp
WHERE locationId = 999

-- Rewritten query END
------------------------------------------------------------------------------------------------

Altered version of the script for longer attendanceIds here.

Split String into rows but keep associated data

Here's how you would do it using STRING_SPLIT and CROSS APPLY. Keep in mind that STRING_SPLIT does not guarantee row ordering in the results.

DROP TABLE IF EXISTS #t1;
CREATE TABLE #t1(
[Name] VARCHAR(MAX),
[TimeStamp] VARCHAR(MAX),
[StudentIds] VARCHAR(MAX),
[Category] VARCHAR(MAX),
[ReportName] VARCHAR(MAX)
)
GO

INSERT INTO #t1 ([Name],[TimeStamp],[StudentIds],[Category],[ReportName])
SELECT 'Teacher Report Run', '2021-08-31 20:24:52.5430000', '111111', 'Teacher Reports', 'Report 1' UNION ALL
SELECT 'Teacher Report Run', '2021-08-30 1:01:22.1250000', '222222,333333,444444', 'Teacher Reports', 'Report 2'

SELECT #t1.Name, #t1.TimeStamp, sids.value StudentIds, #t1.Category, #t1.ReportName
FROM #t1
CROSS APPLY STRING_SPLIT(StudentIds, ',') sids

ORACLE: Splitting a string into multiple rows

From Oracle 12, you can use:

SELECT branch_code, year, month, day, holiday
FROM branches
CROSS JOIN LATERAL (
SELECT LEVEL AS day,
SUBSTR(holiday_list, LEVEL, 1) AS holiday
FROM DUAL
CONNECT BY LEVEL <= LENGTH(holiday_list)
)

Which, for the sample data:

CREATE TABLE branches (BRANCH_CODE, YEAR, MONTH, HOLIDAY_LIST) AS
SELECT '001', 2021, 1, 'HHHWWWHHHHWWWWWHHWWWWWHHWWWWWHH' FROM DUAL UNION ALL
SELECT '001', 2021, 2, 'WWWWWHHWWWWWHHWWWWWHHWHWWWHH' FROM DUAL

Outputs:























































































































BRANCH_CODEYEARMONTHDAYHOLIDAY
001202111H
001202112H
001202113H
001202114W
...............
0012021129W
0012021130H
0012021131H
001202121W
001202122W
001202123W
...............
0012021226W
0012021227H
0012021228H

Split delimited strings in multiple columns and separate them into rows

We may do this in an easier way if we make the delimiter same

library(dplyr)
library(tidyr)
library(stringr)
to_expand %>%
mutate(first = str_replace(first, "~", "|")) %>%
separate_rows(first, second, sep = "\\|")
# A tibble: 2 x 2
first second
<chr> <chr>
1 a 1~2~3
2 b 4~5~6

How to separate string into multiple rows in Pandas

Crete list of genres by split, repeat values by str.len and last flatten lists by chain.from_iterable:

from itertools import chain

genres = df['genres'].str.split('|')
df = pd.DataFrame({
'Name' : df['Name'].values.repeat(genres.str.len()),
'genres' : list(chain.from_iterable(genres.tolist()))
})

print (df)
Name genres
0 A Action
1 A Adventure
2 A Science Fiction
3 A Thriller
4 B Action
5 B Adventure
6 B Science Fiction
7 B Thriller
8 C Adventure
9 C Science Fiction
10 C Thriller

EDIT:

Solution for dynamic number of columns:

print (df)
Name genres col
0 A Action|Adventure|Science Fiction|Thriller 2
1 B Action|Adventure|Science Fiction|Thriller 3
2 C Adventure|Science Fiction|Thriller 5

from itertools import chain

cols = df.columns.difference(['genres'])
genres = df['genres'].str.split('|')

df = (df.loc[df.index.repeat(genres.str.len()), cols]
.assign(genres=list(chain.from_iterable(genres.tolist()))))
print (df)
Name col genres
0 A 2 Action
0 A 2 Adventure
0 A 2 Science Fiction
0 A 2 Thriller
1 B 3 Action
1 B 3 Adventure
1 B 3 Science Fiction
1 B 3 Thriller
2 C 5 Adventure
2 C 5 Science Fiction
2 C 5 Thriller

How to split string into multiple rows in a table using MYSQL?

One typical solution is to generate a list of numbers (by creating a table or using an inline query) and then JOIN it with the source data, using MySQL string functions to extract the relevant parts of serial_number.

Here is a solution that uses an inline query and that can handle up to 5 serials per record (to handle more serials, you would need to expand the subquery with more UNION ALLs):

SELECT
SUBSTRING_INDEX(
SUBSTRING_INDEX(box.serial_number, ',', num.n),
',',
-1
) new_serial_number,
box.height,
box.width,
box.weight
FROM
(
SELECT 1 n
UNION ALL SELECT 2
UNION ALL SELECT 3
UNION ALL SELECT 4
UNION ALL SELECT 5
) num
INNER JOIN box
ON CHAR_LENGTH(box.serial_number)
- CHAR_LENGTH(REPLACE(box.serial_number, ',', ''))
>= num.n - 1

This demo on DB Fiddle with your sample data returns:

| new_serial_number | height | width | weight |
| ----------------- | ------ | ----- | ------ |
| 1 | 5 | 2 | 100 |
| 2 | 5 | 2 | 100 |


Related Topics



Leave a reply



Submit