SQL query with REGEXP_SUBSTR is slow and times out once more records are inserted into the table
Check if one value is a sub-string of the other:
-- Delimited-list membership check: wrap both the stored CSV and the id in
-- commas so that e.g. id 123 cannot false-match inside '...,91234,...'.
-- Note: the leading '%' wildcard defeats any index on WHERE_CLAUSE.
SELECT cbd.*
FROM cust_bug_data cbd
WHERE EXISTS (
    SELECT *
    FROM cbcm c
    WHERE c.report_name = 'SE_SUPP2'
      AND ',' || c.where_clause || ',' LIKE '%,' || cbd.id || ',%'
)
Outputs:
ID | NAME
-------: | :---
29786399 | test
db<>fiddle here
Or don't store delimited strings in your database:
-- Normalised schema: one row per list element instead of a delimited string
-- (avoids the "jaywalking" antipattern). Constraints are named so that
-- constraint-violation errors are greppable.
CREATE TABLE cbcm(
    REPORT_NAME varchar2(30) CONSTRAINT cbcm_pk PRIMARY KEY
);
CREATE TABLE cbcm_where(
    REPORT_NAME varchar2(30)
        CONSTRAINT cbcm_where_report_name_fk REFERENCES cbcm ( REPORT_NAME ),
    WHERE_CLAUSE integer,
    CONSTRAINT cbcm_where_pk PRIMARY KEY ( REPORT_NAME, WHERE_CLAUSE )
);
CREATE TABLE cust_bug_data(
    id   integer,
    name varchar2(20)
);
-- Sample data; explicit column lists so the inserts survive schema changes.
insert into cbcm ( REPORT_NAME ) values('SE_SUPP2');
insert into cust_bug_data ( id, name ) VALUES ( 29786399, 'test');
Split the list up when you insert it (here is a method that uses faster string functions rather than slow regular expressions):
-- Split the CSV into one row per value using a recursive CTE driven by
-- INSTR/SUBSTR (plain string functions; no regular expressions).
insert into cbcm_where ( REPORT_NAME, WHERE_CLAUSE )
WITH list_to_insert ( report_name, list ) AS (
SELECT 'SE_SUPP2', '29786399,29271272,29815958,29821597,29821140,29821791,29850566' FROM DUAL
),
-- bounds walks the list one delimiter at a time:
--   startidx = first character of the current element,
--   endidx   = position of the next comma (0 when no comma remains).
-- Recursion stops once endidx = 0 (last element reached).
bounds ( report_name, list, startidx, endidx ) AS (
SELECT report_name,
list,
1,
INSTR( list, ',', 1 )
FROM list_to_insert
UNION ALL
SELECT report_name,
list,
endidx + 1,
INSTR( list, ',', endidx + 1 )
FROM bounds
WHERE endidx > 0
)
SELECT report_name,
TO_NUMBER(
CASE
-- endidx = 0: final element, take everything up to the end of the string
WHEN endidx = 0
THEN SUBSTR( list, startidx )
ELSE SUBSTR( list, startidx, endidx - startidx )
END
)
FROM bounds
Then query your data:
-- With the list normalised into cbcm_where this becomes a plain,
-- index-friendly equality semi-join on the (report_name, where_clause) PK.
SELECT cbd.*
FROM cust_bug_data cbd
WHERE EXISTS (
    SELECT *
    FROM cbcm_where w
    WHERE w.report_name = 'SE_SUPP2'
      AND w.where_clause = cbd.id
)
ID | NAME
-------: | :---
29786399 | test
db<>fiddle here
split string into several rows
From your comment to @PrzemyslawKruglej answer
Main problem is with internal query with
connect by
, it generates an astonishing number of rows
The amount of rows generated can be reduced with the following approach:
/* test table populated with sample data from your question */
SQL> create table t1(str) as(
2 select 'a;b;c' from dual union all
3 select 'b;c;d' from dual union all
4 select 'a;c;d' from dual
5 );
Table created
-- The number of rows generated depends solely on the longest string.
-- If (say) the longest string contains 3 words (won't count separator `;`)
-- and we have 100 rows in our table, then we will end up with 300 rows
-- for further processing, no more.
-- Generate only MAX(token-count) index values, then cross join to t1:
-- rows produced = rows_in_t1 * longest_token_count (here 3 * 3 = 9).
with occurrence(ocr) as(
select level
from ( select max(regexp_count(str, '[^;]+')) as mx_t
from t1 ) t
connect by level <= mx_t
)
-- COUNT() ignores the NULLs that regexp_substr returns for occurrence
-- numbers beyond a given row's token count, so this counts only the
-- tokens actually produced.
select count(regexp_substr(t1.str, '[^;]+', 1, o.ocr)) as generated_for_3_rows
from t1
cross join occurrence o;
Result: For three rows where the longest one is made up of three words, we will generate 9 rows:
GENERATED_FOR_3_ROWS
--------------------
9
Final query:
-- One row per token-occurrence index, capped at the longest string's
-- token count; then count how often each distinct token appears.
WITH occurrence ( ocr ) AS (
    SELECT level
    FROM ( SELECT MAX( REGEXP_COUNT( str, '[^;]+' ) ) AS mx_t
           FROM t1 ) t
    CONNECT BY level <= mx_t
),
tokens AS (
    -- NULL when a row has fewer tokens than the occurrence index
    SELECT REGEXP_SUBSTR( t1.str, '[^;]+', 1, o.ocr ) AS res
    FROM t1
    CROSS JOIN occurrence o
)
SELECT res,
       COUNT( res ) AS cnt
FROM tokens
WHERE res IS NOT NULL
GROUP BY res
ORDER BY res;
Result:
RES CNT
----- ----------
a 2
b 2
c 3
d 2
SQLFiddle Demo
Find out more about regexp_count()(11g and up) and regexp_substr() regular expression functions.
Note: Regular expression functions are relatively expensive to compute, and when it comes to processing a very large amount of data, it might be worth considering a switch to plain PL/SQL. Here is an example.
Oracle sql regular expression to split a string by a pattern
For example:
SQL> with test (email) as
2 (select 'abcd@gmail.com' from dual union
3 select 'bdef@gmail.com abdfgr@yahoo.com' from dual
4 )
5 select ltrim(regexp_substr(email, '@(\w+\.\w+)', 1, column_value), '@') res
6 from test,
7 table(cast(multiset(select level from dual
8 connect by level <= regexp_count(email, '@' )
9 ) as sys.odcinumberlist));
RES
-----------------------------------------------------------------------------------
gmail.com
gmail.com
yahoo.com
SQL>
Why this regexp in Oracle SQL loses the beginning of string?
Allow the .
pattern to match all characters by adding the n
flag to the regular expression:
-- Split p_value on p_delimiter; match flags: 'c' = case-sensitive,
-- 'n' = let '.' also match the newline (chr(10)) inside the value.
WITH params ( p_value, p_delimiter ) AS (
SELECT 'ab' || chr(10) || 'cd', 'xxx' FROM dual
)
-- Fix: the CTE declares the column as p_delimiter; the original query
-- referenced the misspelled p_delimeter, which raises ORA-00904
-- ("invalid identifier") instead of running.
SELECT REGEXP_SUBSTR(p_value, '(.*?)(' || p_delimiter || '|$)', 1, level, 'cn', 1) AS CUT
FROM params
CONNECT BY LEVEL < REGEXP_COUNT( p_value, '(.*?)(' || p_delimiter || '|$)' );
or you can use a simple function:
Oracle Setup:
-- Collection type returned by split_String (one VARCHAR2 per element).
CREATE TYPE VARCHAR2_TABLE AS TABLE OF VARCHAR2(4000);
/
-- Splits i_str on i_delim using INSTR/SUBSTR (no regular expressions).
-- Returns one collection element per delimited value.
-- NOTE(review): when i_str is NULL, c_len is NULL, the IF is skipped and
-- an empty collection is returned; a trailing delimiter yields a NULL
-- final element — confirm both behaviours suit the callers.
CREATE OR REPLACE FUNCTION split_String(
i_str IN VARCHAR2,
i_delim IN VARCHAR2 DEFAULT ','
) RETURN VARCHAR2_TABLE DETERMINISTIC
AS
p_result VARCHAR2_TABLE := VARCHAR2_TABLE();
p_start NUMBER(5) := 1; -- start position of the current element
p_end NUMBER(5); -- position of the next delimiter (0 = none left)
c_len CONSTANT NUMBER(5) := LENGTH( i_str );
c_ld CONSTANT NUMBER(5) := LENGTH( i_delim ); -- delimiter may be multi-char
BEGIN
IF c_len > 0 THEN
p_end := INSTR( i_str, i_delim, p_start );
-- one element per delimiter found
WHILE p_end > 0 LOOP
p_result.EXTEND;
p_result( p_result.COUNT ) := SUBSTR( i_str, p_start, p_end - p_start );
p_start := p_end + c_ld;
p_end := INSTR( i_str, i_delim, p_start );
END LOOP;
-- append the remainder after the last delimiter
IF p_start <= c_len + 1 THEN
p_result.EXTEND;
p_result( p_result.COUNT ) := SUBSTR( i_str, p_start, c_len - p_start + 1 );
END IF;
END IF;
RETURN p_result;
END;
/
Query:
-- Expand each params row into one row per split element. The comma join
-- provides the lateral correlation that TABLE( split_String(...) ) needs
-- to reference p_value/p_delimiter from the same row.
WITH params ( p_value, p_delimiter ) AS (
SELECT 'ab' || chr(10) || 'cd', 'xxx' FROM dual
)
SELECT COLUMN_VALUE AS CUT
FROM params,
TABLE( split_String( p_value, p_delimiter ) );
Oracle SQL Select a Variable and split it by semicolon
OK, you have a semicolon-separated list of values. You said that you want to have them in different rows, but that's not what the example shows ... this:
I would like it to look like this when using Select:
test test1 test2 test3
is only one row, with space as a separator.
Anyway: presuming that you really want different rows, then replace current separator with a line feed character (chr(10)), e.g.
SQL> select replace('test;test1;test2;test3', ';', chr(10)) result
2 from dual;
RESULT
----------------------
test
test1
test2
test3
SQL>
As it turns out you need different columns after all, then - with such a sample data - regular expressions are a simple solution:
SQL> with test (col) as
2 (select 'test;test1;test2;test3' from dual)
3 select regexp_substr(col, '\w+', 1, 1) col1,
4 regexp_substr(col, '\w+', 1, 2) col2,
5 regexp_substr(col, '\w+', 1, 3) col3,
6 regexp_substr(col, '\w+', 1, 4) col4
7 from test;
COL1 COL2 COL3 COL4
---- ----- ----- -----
test test1 test2 test3
SQL>
Join to splitted string columns in Oracle
I think this is the query you want:
-- Join line items to the backslash-delimited products row by extracting
-- fields 1-3 inside the join condition; field 4 is returned as a column.
-- ('[^\]+' matches a run of characters other than a backslash.)
SELECT gch.Line,
       gch.productB,
       gch.productC,
       REGEXP_SUBSTR(p.delimited, '[^\]+', 1, 4)
FROM products p
INNER JOIN lineitems gch
    ON  gch.Line     = REGEXP_SUBSTR(p.delimited, '[^\]+', 1, 1)
    AND gch.productB = REGEXP_SUBSTR(p.delimited, '[^\]+', 1, 2)
    AND gch.productC = REGEXP_SUBSTR(p.delimited, '[^\]+', 1, 3)
WHERE p.productid = 1;
You need neither a subquery nor a temporary table.
Performance and Readability of REGEXP_SUBSTR vs INSTR and SUBSTR
I already posted an answer showing how to solve this problem using INSTR
and SUBSTR
the right way.
In this "Answer" I address the other question - which solution is more efficient. I will explain the test below, but here is the bottom line: the REGEXP
solution takes 40 times longer than the INSTR/SUBSTR
solution.
Setup: I created a table with 1.5 million random strings (all exactly eight characters long, all upper-case letters). Then I modified 10% of the strings to add the substring 'PLE'
, another 10% to add a '#'
and another 10% to add 'ALL'
. I did this by splitting an original string at position mod(rownum, 9)
- that is a number between 0 and 8 - and concatenating 'PLE'
or '#'
or 'ALL'
at that position. Granted, not the most efficient or elegant way to get the kind of test data we needed, but that is irrelevant - the point is just to create the test data and use it in our tests.
So: we now have a table with just one column, data1
, with some random strings in 1.5 million rows. 10% each have the substring PLE
or #
or ALL
in them.
The test consists in creating the new string data2
as in the original post. I am not inserting the result back in the table; regardless of how data2
is calculated, the time to insert it back in the table should be the same.
Instead, I put the main query inside an outer one that computes the sum of the lengths of the resulting data2
values. This way I guarantee the optimizer can't take shortcuts: all data2
values must be generated, their lengths must be measured, and then summed together.
Below are the statements needed to create the base table, which I called table_z
, then the queries I ran.
-- Benchmark fixture: 1.5M random 8-char upper-case strings.
create table table_z as
select dbms_random.string('U', 8) as data1 from dual
connect by level <= 1500000;
-- Inject 'PLE' / '#' / 'ALL' into 10% of rows each, splicing the marker in
-- at position mod(rownum, 9) (i.e. somewhere between offset 0 and 8).
-- NOTE(review): rownum here is assigned as candidate rows are processed,
-- so which physical rows get which marker is arbitrary — fine for a
-- benchmark, not for reproducible data.
update table_z
set data1 = case
when rownum between 1 and 150000 then substr(data1, 1, mod(rownum, 9))
|| 'PLE' || substr(data1, mod(rownum, 9) + 1)
when rownum between 150001 and 300000 then substr(data1, 1, mod(rownum, 9))
|| '#' || substr(data1, mod(rownum, 9) + 1)
when rownum between 300001 and 450000 then substr(data1, 1, mod(rownum, 9))
|| 'ALL' || substr(data1, mod(rownum, 9) + 1)
end
where rownum <= 450000;
commit;
INSTR/SUBSTR
solution
-- Benchmark driver: summing LENGTH(data2) forces every data2 value to be
-- computed, so the optimizer cannot short-circuit the inner query.
-- The WHEN order (PLE, then #, then ALL) is part of the spec — keep it.
SELECT SUM(LENGTH(data2))
FROM (
    SELECT data1,
           CASE
               WHEN INSTR(data1, 'PLE', 2) > 0 THEN SUBSTR(data1, 1, INSTR(data1, 'PLE', 2) - 1)
               WHEN INSTR(data1, '#' , 2) > 0 THEN SUBSTR(data1, 1, INSTR(data1, '#' , 2) - 1)
               WHEN INSTR(data1, 'ALL', 2) > 0 THEN SUBSTR(data1, 1, INSTR(data1, 'ALL', 2) - 1)
               ELSE data1
           END AS data2
    FROM table_z
) t;
SUM(LENGTH(DATA2))
------------------
10713352
1 row selected.
Elapsed: 00:00:00.73
REGEXP
solution
-- REGEXP variant of the same computation. Each pattern is anchored with
-- '^' so the engine does not retry the lazy match from every start
-- position; per the measurements below this cuts the runtime from ~30s
-- to ~10s while returning identical results (occurrence 1 of the
-- unanchored lazy pattern already had to start at position 1).
select sum(length(data2))
from (
select data1,
       COALESCE(REGEXP_SUBSTR(DATA1, '^(.+?)PLE',1,1,null,1)
               ,REGEXP_SUBSTR(DATA1, '^(.+?)#',1,1,null,1)
               ,REGEXP_SUBSTR(DATA1, '^(.+?)ALL',1,1,null,1)
               ,DATA1)
       as data2
from table_z
);
SUM(LENGTH(DATA2))
------------------
10713352
1 row selected.
Elapsed: 00:00:30.75
Before anyone suggests these things: I repeated both queries several times; the first solution always runs in 0.75 to 0.80 seconds, the second query runs in 30 to 35 seconds. More than 40 times slower. (So it is not a matter of the compiler/optimizer spending time to compile the query; it is really the execution time.) Also, this has nothing to do with reading the 1.5 million values from the base table - that is the same in both tests, and it takes far less time than the processing. In any case, I ran the INSTR/SUBSTR
query first, so if there was any caching, the REGEXP
query would have been the one to benefit.
Edit: I just figured out one inefficiency in the proposed REGEXP solution. If we anchor the search pattern to the beginning of the string (for example '^(.+?)PLE'
, notice the ^
anchor), the runtime for the REGEXP query drops from 30 seconds to 10 seconds. Apparently the Oracle implementation isn't smart enough to recognize this equivalence and tries searches from the second character, from the third, etc. Still the execution time is almost 15 times longer; 15 < 40 but that is still a very large difference.
How to migrate data from the old table to the new table in oracle by removing the comma from the column values
I found the answer to the question. The below query did work for me.
-- Migrates Old_Table to New_Table, splitting the comma-separated Structure
-- column into one row per element (empty/whitespace-only elements dropped).
insert into New_Table (ID, Structure)
(select ID, trim(Structure)
from (
WITH TT AS
(SELECT ID, Structure FROM Old_Table)
-- The string is wrapped in commas so every element sits between two
-- delimiters; LEVEL indexes the Nth comma pair.
SELECT ID, substr(str,
instr(str, ',', 1, LEVEL) + 1,
instr(str, ',', 1, LEVEL + 1) -
instr(str, ',', 1, LEVEL) - 1) Structure
FROM (SELECT ID, rownum AS r,
','|| Structure||',' AS STR
FROM TT )
-- PRIOR r = r keeps the hierarchy within a single source row; the
-- non-deterministic PRIOR dbms_random expression makes each level
-- distinct so CONNECT BY does not flag the self-join as a cycle.
CONNECT BY PRIOR r = r
AND instr(str, ',', 1, LEVEL + 1) > 0
AND PRIOR dbms_random.STRING('p', 10) IS NOT NULL)
where trim(Structure) is not null);
Related Topics
How to Pass a Parameter to a T-SQL Script
Referencing a Calculated Column in the Where Clause SQL
Sqlite Get Name of Attached Databases
Are There Downsides to Using Prepared Statements
How to List the Source Table Name of Columns in a View (SQL Server 2005)
Does Liquibase Support Dry Run
Detect Duplicate Items in Recursive Cte
There Is Already an Object Named '#Tmptable' in the Database
How to Calculate Balances in an Accounting Software Using Postgres Window Function
Sql: Try/Catch Doesn't Catch an Error When Attempting to Access a Table That It Can't Find
SQL Server - Does Trigger Affects @@Rowcount
Rounding to 2 Decimal Places in SQL
Using with Nolock Table Hint in Query Using View - Does It Propagate Within the View