Oracle SQL: the Insert Query with Regexp_Substr Expression Is Very Long ( Split String )

Sql query with regexp_substr is slow and timing out once more records are inserted in table

Check if one value is a sub-string of the other:

-- Return every cust_bug_data row whose id occurs as one complete element of
-- the comma-separated WHERE_CLAUSE list stored for report 'SE_SUPP2'.
SELECT cbd.*
FROM   cust_bug_data cbd
WHERE  EXISTS (
         SELECT 1
         FROM   cbcm rpt
         WHERE  rpt.REPORT_NAME = 'SE_SUPP2'
         -- Both sides are wrapped in commas so that an id only matches a
         -- whole list element, never a fragment of a longer number.
         AND    ',' || rpt.WHERE_CLAUSE || ',' LIKE '%,' || cbd.id || ',%'
       )

Outputs:


      ID | NAME
-------: | :---
29786399 | test

db<>fiddle here

Or don't store delimited strings in your database:

-- Parent table: one row per report.
CREATE TABLE cbcm(
  REPORT_NAME varchar2(30) PRIMARY KEY
);

-- Child table: one row per (report, id) pair. This replaces the single
-- comma-delimited WHERE_CLAUSE string with individually stored integers.
CREATE TABLE cbcm_where(
  REPORT_NAME  varchar2(30) REFERENCES cbcm ( REPORT_NAME ),
  WHERE_CLAUSE integer,
  PRIMARY KEY ( REPORT_NAME, WHERE_CLAUSE )
);

-- Data table that is filtered by the ids stored in cbcm_where.
CREATE TABLE cust_bug_data(id integer, name varchar2(20));

insert into  cbcm (REPORT_NAME ) values('SE_SUPP2');

-- Explicit column list: the insert keeps working (and keeps putting values in
-- the right columns) if columns are later added to or reordered in the table.
insert into  cust_bug_data ( id, name ) VALUES ( 29786399, 'test');

Split the list up when you insert it (here is a method that uses faster string functions rather than slow regular expressions):

-- Split one comma-separated id list into rows and load them into cbcm_where.
-- Element boundaries are located with INSTR/SUBSTR in a recursive CTE rather
-- than with regular expressions.
INSERT INTO cbcm_where ( REPORT_NAME, WHERE_CLAUSE )
WITH src ( report_name, list ) AS (
  -- The report name and the delimited list that is to be split.
  SELECT 'SE_SUPP2',
         '29786399,29271272,29815958,29821597,29821140,29821791,29850566'
  FROM   DUAL
),
cuts ( report_name, list, startidx, endidx ) AS (
  -- Anchor: the first element starts at position 1 and ends at the first
  -- comma (INSTR yields 0 when no comma is found).
  SELECT s.report_name,
         s.list,
         1,
         INSTR( s.list, ',', 1 )
  FROM   src s
  UNION ALL
  -- Step: the next element starts right after the previous comma; recursion
  -- stops after the row for which no further comma was found.
  SELECT c.report_name,
         c.list,
         c.endidx + 1,
         INSTR( c.list, ',', c.endidx + 1 )
  FROM   cuts c
  WHERE  c.endidx > 0
)
SELECT c.report_name,
       TO_NUMBER(
         CASE c.endidx
         WHEN 0
         THEN SUBSTR( c.list, c.startidx )                          -- last element: take the rest
         ELSE SUBSTR( c.list, c.startidx, c.endidx - c.startidx )   -- text up to the next comma
         END
       )
FROM   cuts c

Then query your data:

-- Return every cust_bug_data row whose id is stored in cbcm_where
-- for report 'SE_SUPP2'.
SELECT cbd.*
FROM   cust_bug_data cbd
WHERE  EXISTS (
         SELECT 1
         FROM   cbcm_where ids
         WHERE  ids.REPORT_NAME  = 'SE_SUPP2'
         AND    ids.WHERE_CLAUSE = cbd.id
       )


      ID | NAME
-------: | :---
29786399 | test

db<>fiddle here

split string into several rows

From your comment to @PrzemyslawKruglej answer

Main problem is with internal query with connect by, it generates astonishing amount of rows

The amount of rows generated can be reduced with the following approach:

/* Test table t1: a single column, str, populated with three
   semicolon-delimited sample strings from your question. */
SQL> create table t1(str) as(
  2    select 'a;b;c'  from dual union all
  3    select 'b;c;d'  from dual union all
  4    select 'a;c;d'  from dual
  5  );
Table created

-- The number of generated rows depends solely on the longest string:
-- every row of t1 is paired with one row per token position, from 1 up to
-- the largest token count found in the table.
-- If (say) the longest string contains 3 words (the separator `;` is not
-- counted) and the table has 100 rows, we end up with 300 rows for further
-- processing, no more.
WITH occurrence ( ocr ) AS (
  SELECT LEVEL
  FROM   ( SELECT MAX( REGEXP_COUNT( str, '[^;]+' ) ) AS mx_t
           FROM   t1 ) longest
  CONNECT BY LEVEL <= longest.mx_t
)
SELECT COUNT( REGEXP_SUBSTR( src.str, '[^;]+', 1, pos.ocr ) ) AS generated_for_3_rows
FROM   t1 src
       CROSS JOIN occurrence pos;

Result: For three rows where the longest one is made up of three words, we will generate 9 rows:

GENERATED_FOR_3_ROWS
--------------------
                  9

Final query:

-- Count how often each semicolon-separated token occurs across all rows of t1.
WITH occurrence ( ocr ) AS (
  -- One row per token position: 1 .. largest token count in the table.
  SELECT LEVEL
  FROM   ( SELECT MAX( REGEXP_COUNT( str, '[^;]+' ) ) AS mx_t
           FROM   t1 ) longest
  CONNECT BY LEVEL <= longest.mx_t
),
tokens AS (
  -- The ocr-th token of every string; NULL where a string has fewer tokens.
  SELECT REGEXP_SUBSTR( src.str, '[^;]+', 1, pos.ocr ) AS res
  FROM   t1 src
         CROSS JOIN occurrence pos
)
SELECT res,
       COUNT( res ) AS cnt
FROM   tokens
WHERE  res IS NOT NULL
GROUP  BY res
ORDER  BY res;

Result:

RES          CNT
----- ----------
a              2
b              2
c              3
d              2

SQLFIddle Demo

Find out more about the regexp_count() (11g and up) and regexp_substr() regular expression functions.

Note: Regular expression functions are relatively expensive to compute, and when processing a very large amount of data it might be worth switching to plain PL/SQL. Here is an example.

Oracle sql regular expression to split a string by a pattern

For example:

-- For each row, generate the numbers 1 .. (count of '@' in email) and use each
-- one as the occurrence argument of regexp_substr, so every '@word.word' match
-- becomes its own output row; ltrim then strips the leading '@'.
SQL> with test (email) as
  2    (select 'abcd@gmail.com' from dual union
  3     select 'bdef@gmail.com abdfgr@yahoo.com' from dual
  4    )
  5  select ltrim(regexp_substr(email, '@(\w+\.\w+)', 1, column_value), '@') res
  6  from test,
  7       table(cast(multiset(select level from dual
  8                           connect by level <= regexp_count(email, '@' )
  9                          ) as sys.odcinumberlist));

RES
-----------------------------------------------------------------------------------

gmail.com
gmail.com
yahoo.com

SQL>

Why this regexp in Oracle SQL loses the beginning of string?

Allow the . pattern to match all characters by adding the n flag to the regular expression:

-- Split p_value on p_delimiter, returning one row per piece.
-- The 'n' match parameter lets '.' match the newline character, so a piece
-- that contains chr(10) is returned whole; 'c' keeps matching case-sensitive.
WITH params ( p_value, p_delimiter ) AS (
  SELECT 'ab' || chr(10) || 'cd', 'xxx' FROM dual
)
-- Sub-expression 1 is the text before the delimiter (or before end of string).
SELECT REGEXP_SUBSTR( p_value, '(.*?)(' || p_delimiter || '|$)', 1, LEVEL, 'cn', 1 ) AS CUT
  FROM params
-- REGEXP_COUNT is given the same 'cn' match parameter as REGEXP_SUBSTR so the
-- number of generated rows agrees with what is extracted. The comparison is
-- '<' rather than '<=' presumably because the pattern also matches an empty
-- string at the very end of the input, making the count one too high.
CONNECT BY LEVEL < REGEXP_COUNT( p_value, '(.*?)(' || p_delimiter || '|$)', 1, 'cn' );

or you can use a simple function:

Oracle Setup:

-- Collection type returned by split_String: a nested table of strings.
CREATE TYPE VARCHAR2_TABLE AS TABLE OF VARCHAR2(4000);
/

-- split_String: break i_str into pieces at every occurrence of i_delim.
--   i_str    the string to split
--   i_delim  the delimiter (may be longer than one character); defaults to ','
-- Returns a VARCHAR2_TABLE holding the pieces in order. An input for which
-- LENGTH returns NULL yields an empty collection.
CREATE OR REPLACE FUNCTION split_String(
  i_str    IN  VARCHAR2,
  i_delim  IN  VARCHAR2 DEFAULT ','
) RETURN VARCHAR2_TABLE DETERMINISTIC
IS
  v_pieces             VARCHAR2_TABLE := VARCHAR2_TABLE();
  v_str_len   CONSTANT NUMBER(5) := LENGTH( i_str );
  v_delim_len CONSTANT NUMBER(5) := LENGTH( i_delim );
  v_from               NUMBER(5) := 1;   -- start position of the current piece
  v_hit                NUMBER(5);        -- position of the next delimiter
BEGIN
  -- Nothing to split: hand back the empty collection.
  IF NVL( v_str_len, 0 ) <= 0 THEN
    RETURN v_pieces;
  END IF;

  LOOP
    v_hit := INSTR( i_str, i_delim, v_from );
    -- Stop when no further delimiter exists (INSTR gave 0 or NULL).
    EXIT WHEN NVL( v_hit, 0 ) <= 0;

    v_pieces.EXTEND;
    v_pieces( v_pieces.LAST ) := SUBSTR( i_str, v_from, v_hit - v_from );
    -- Skip over the whole delimiter, not just one character.
    v_from := v_hit + v_delim_len;
  END LOOP;

  -- Whatever follows the last delimiter is the final piece.
  IF v_from <= v_str_len + 1 THEN
    v_pieces.EXTEND;
    v_pieces( v_pieces.LAST ) := SUBSTR( i_str, v_from, v_str_len - v_from + 1 );
  END IF;

  RETURN v_pieces;
END;
/

Query:

-- Feed the sample value and delimiter to split_String and return each piece
-- of the resulting collection as its own row.
WITH params ( p_value, p_delimiter ) AS (
  SELECT 'ab' || chr(10) || 'cd',
         'xxx'
  FROM   dual
)
SELECT pieces.COLUMN_VALUE AS CUT
FROM   params prm
       CROSS JOIN TABLE( split_String( prm.p_value, prm.p_delimiter ) ) pieces;

Oracle SQL Select a Variable and split it by semicolon

OK, you have a semicolon-separated list of values. You said that you want to have them in different rows, but that's not what the example shows ... this:

I would like it to look like this when using Select:

test test1 test2 test3

is only one row, with space as a separator.

Anyway: presuming that you really want different rows, then replace current separator with a line feed character (chr(10)), e.g.

-- Swap every ';' for a line feed, chr(10). The query still returns a single
-- row from dual; the value merely displays on separate lines.
SQL> select replace('test;test1;test2;test3', ';', chr(10)) result
  2  from dual;

RESULT
----------------------
test
test1
test2
test3

SQL>

If it turns out that you need different columns after all, then - with sample data like this - regular expressions are a simple solution:

-- Return the 1st to 4th run of word characters (\w+) as four separate columns;
-- the last argument of regexp_substr selects which occurrence is returned.
SQL> with test (col) as
  2    (select 'test;test1;test2;test3' from dual)
  3  select regexp_substr(col, '\w+', 1, 1) col1,
  4         regexp_substr(col, '\w+', 1, 2) col2,
  5         regexp_substr(col, '\w+', 1, 3) col3,
  6         regexp_substr(col, '\w+', 1, 4) col4
  7  from test;

COL1 COL2  COL3  COL4
---- ----- ----- -----
test test1 test2 test3

SQL>

Join to splitted string columns in Oracle

I think this is the query you want:

-- Join product 1 to lineitems on the first three pieces of its delimited
-- string (pieces are runs of characters matched by '[^\]+', presumably
-- backslash-separated values) and return the fourth piece with the match.
SELECT li.Line,
       li.productB,
       li.productC,
       REGEXP_SUBSTR(p.delimited, '[^\]+', 1, 4)
FROM   products p
       INNER JOIN lineitems li
       ON  REGEXP_SUBSTR(p.delimited, '[^\]+', 1, 1) = li.Line
       AND REGEXP_SUBSTR(p.delimited, '[^\]+', 1, 2) = li.productB
       AND REGEXP_SUBSTR(p.delimited, '[^\]+', 1, 3) = li.productC
WHERE  p.productid = 1;

You need neither a subquery nor a temporary table.

Performance and Readability of REGEXP_SUBSTR vs INSTR and SUBSTR

I already posted an answer showing how to solve this problem using INSTR and SUBSTR the right way.

In this "Answer" I address the other question - which solution is more efficient. I will explain the test below, but here is the bottom line: the REGEXP solution takes 40 times longer than the INSTR/SUBSTR solution.

Setup: I created a table with 1.5 million random strings (all exactly eight characters long, all upper-case letters). Then I modified 10% of the strings to add the substring 'PLE', another 10% to add a '#' and another 10% to add 'ALL'. I did this by splitting an original string at position mod(rownum, 9) - that is a number between 0 and 8 - and concatenating 'PLE' or '#' or 'ALL' at that position. Granted, not the most efficient or elegant way to get the kind of test data we needed, but that is irrelevant - the point is just to create the test data and use it in our tests.

So: we now have a table with just one column, data1, with some random strings in 1.5 million rows. 10% each have the substring PLE or # or ALL in them.

The test consists in creating the new string data2 as in the original post. I am not inserting the result back in the table; regardless of how data2 is calculated, the time to insert it back in the table should be the same.

Instead, I put the main query inside an outer one that computes the sum of the lengths of the resulting data2 values. This way I guarantee the optimizer can't take shortcuts: all data2 values must be generated, their lengths must be measured, and then summed together.

Below are the statements needed to create the base table, which I called table_z, then the queries I ran.

-- Build the benchmark table: 1,500,000 rows, each holding one 8-character
-- string produced by dbms_random.string('U', 8).
create table table_z as
select dbms_random.string('U', 8) as data1 from dual
connect by level <= 1500000;

-- For the first 450,000 rows (by rownum) insert a marker into data1 after the
-- first mod(rownum, 9) characters:
--   rownum      1 .. 150000 -> 'PLE'
--   rownum 150001 .. 300000 -> '#'
--   rownum 300001 .. 450000 -> 'ALL'
update table_z 
set data1 = case
when rownum between      1 and 150000 then substr(data1, 1, mod(rownum, 9)) 
                               || 'PLE' || substr(data1, mod(rownum, 9) + 1)
when rownum between 150001 and 300000 then substr(data1, 1, mod(rownum, 9)) 
                               || '#'   || substr(data1, mod(rownum, 9) + 1)
when rownum between 300001 and 450000 then substr(data1, 1, mod(rownum, 9)) 
                               || 'ALL' || substr(data1, mod(rownum, 9) + 1)
          end
where rownum <= 450000;

commit;

INSTR/SUBSTR solution

-- Benchmark, INSTR/SUBSTR variant.
-- data2 is data1 cut off just before the first marker found at position 2 or
-- later; the markers are tried in the order 'PLE', '#', 'ALL'. When none is
-- found, data2 is data1 unchanged.
-- The outer query sums the lengths so that every data2 value is computed.
select sum(length(data2))
from (
select data1, 
       case 
         when instr(data1, 'PLE', 2) > 0 then substr(data1, 1, instr(data1, 'PLE', 2) - 1)
         when instr(data1, '#'  , 2) > 0 then substr(data1, 1, instr(data1, '#'  , 2) - 1)
         when instr(data1, 'ALL', 2) > 0 then substr(data1, 1, instr(data1, 'ALL', 2) - 1)
         else data1 end
       as data2
from   table_z
);

SUM(LENGTH(DATA2))
------------------
          10713352

1 row selected.

Elapsed: 00:00:00.73

REGEXP solution

-- Benchmark, REGEXP variant.
-- Each REGEXP_SUBSTR returns sub-expression 1, i.e. the non-greedy run of at
-- least one character that precedes the marker. COALESCE takes the first
-- non-NULL result in the order 'PLE', '#', 'ALL' and falls back to DATA1.
-- The outer query sums the lengths so that every data2 value is computed.
select sum(length(data2))
from (
select data1, 
       COALESCE(REGEXP_SUBSTR(DATA1, '(.+?)PLE',1,1,null,1)
                            ,REGEXP_SUBSTR(DATA1, '(.+?)#',1,1,null,1)
                            ,REGEXP_SUBSTR(DATA1, '(.+?)ALL',1,1,null,1)
                            ,DATA1)
       as data2
from   table_z
);

SUM(LENGTH(DATA2))
------------------
          10713352

1 row selected.

Elapsed: 00:00:30.75

Before anyone suggests these things: I repeated both queries several times; the first solution always runs in 0.75 to 0.80 seconds, the second query runs in 30 to 35 seconds. More than 40 times slower. (So it is not a matter of the compiler/optimizer spending time to compile the query; it is really the execution time.) Also, this has nothing to do with reading the 1.5 million values from the base table - that is the same in both tests, and it takes far less time than the processing. In any case, I ran the INSTR/SUBSTR query first, so if there was any caching, the REGEXP query would have been the one to benefit.

Edit: I just figured out one inefficiency in the proposed REGEXP solution. If we anchor the search pattern to the beginning of the string (for example '^(.+?)PLE', notice the ^ anchor), the runtime for the REGEXP query drops from 30 seconds to 10 seconds. Apparently the Oracle implementation isn't smart enough to recognize this equivalence and tries searches from the second character, from the third, etc. Still the execution time is almost 15 times longer; 15 < 40 but that is still a very large difference.

How to migrate data from the old table to the new table in oracle by removing the comma from the column values

I found the answer to the question. The below query did work for me.

-- Copy Old_Table into New_Table, turning each comma-separated Structure value
-- into one row per non-empty, trimmed element.
insert into New_Table (ID, Structure)
(select ID, trim(Structure) 
from (
WITH TT AS
 (SELECT ID, Structure FROM Old_Table)
 -- Element number LEVEL is the text between the LEVEL-th and the
 -- (LEVEL + 1)-th comma of the comma-wrapped string.
 SELECT ID, substr(str,
                   instr(str, ',', 1, LEVEL) + 1,
                   instr(str, ',', 1, LEVEL + 1) -
                   instr(str, ',', 1, LEVEL) - 1) Structure
     -- r tags each source row; the value is wrapped in commas so every
     -- element has a comma on both sides.
     FROM   (SELECT ID, rownum AS r,
                    ','|| Structure||',' AS STR
               FROM   TT )
     -- PRIOR r = r keeps each expansion within its own source row, and the
     -- instr condition stops once no (LEVEL + 1)-th comma exists.
     -- NOTE(review): the PRIOR dbms_random.STRING(...) term is presumably
     -- there to stop Oracle from reporting a CONNECT BY loop - confirm.
     CONNECT BY PRIOR r = r
         AND    instr(str, ',', 1, LEVEL + 1) > 0
         AND    PRIOR dbms_random.STRING('p', 10) IS NOT NULL)
         where trim(Structure) is not null);