String Tokenizer for Oracle

UPDATED FUNCTION AT END OF ARTICLE RETURNS SEPARATORS

We often have need of string tokenizers in applications. Here is one for Oracle based on work by Pawel Barut.

 -- We need a type to hold the returned tokens
 --
 CREATE OR REPLACE TYPE T_TokenSet AS TABLE OF VARCHAR2(4000);
 /
 SHOW errors
 GRANT EXECUTE ON T_TokenSet TO public WITH GRANT OPTION;
 /*********************************************************************************
 ** @function   : Tokenizer
 ** @precis      : Splits any string into its tokens.
 ** @description : Supplied a string and a list of separators this function
 **                returns resultant tokens as a pipelined collection.
 ** @example     : SELECT t.column_value
 **                  FROM TABLE(tokenizer('The rain in spain, stays mainly on the plain.!',' ,.!') ) t; 
 ** @param       : p_string. The string to be Tokenized.
 ** @param       : p_separators. The characters that are used to split the string.
 ** @requires    : t_TokenSet type to be declared.
 ** @history     : Pawel Barut, http://pbarut.blogspot.com/2007/03/yet-another-tokenizer-in-oracle.html
 ** @history     : Simon Greener - July 2006 - Original coding (extended SQL sourced from a blog on the internet)
 **/
 CREATE OR REPLACE
   FUNCTION Tokenizer(p_string     IN VarChar2,
                      p_separators IN VarChar2 DEFAULT ' ')
     RETURN T_TokenSet Pipelined
   AS
     v_strs camod_common.T_TokenSet;
   BEGIN
     IF ( p_string IS NULL
          OR
          p_separators IS NULL ) THEN
        RETURN;
     END IF;
     WITH sel_string AS (SELECT p_string fullstring FROM dual)
     SELECT substr(fullstring, beg+1, end_p-beg-1) token
            Bulk Collect INTO v_strs
       FROM (SELECT beg, Lead(beg) OVER (ORDER BY beg) end_p, fullstring
               FROM (SELECT beg, fullstring
                       FROM (SELECT Level beg, fullstring
                               FROM sel_string
                             CONNECT BY Level <= LENGTH(fullstring)
                   )
                      WHERE instr(p_separators,substr(fullstring,beg,1)) >0
                     UNION ALL
                     SELECT 0, fullstring
                       FROM sel_string
                     UNION ALL
                     SELECT LENGTH(fullstring)+1, fullstring
                       FROM sel_string)
            )
      WHERE end_p IS NOT NULL
        AND end_p > beg + 1;
     FOR i IN v_strs.FIRST..v_strs.LAST Loop
       PIPE ROW(v_strs(i));
     END Loop;
     RETURN;
   END Tokenizer;

Here are my, simple, tests.

 SELECT DISTINCT t.column_value AS token
   FROM TABLE(Tokenizer('LineString:MultiLineString:MultiPoint:MultiPolygon:Point:Point:LineString:Polygon:Polygon',':')) t;

Result.

token
LineString
MultiLineString
MultiPoint
MultiPolygon
Point
Polygon
 SELECT t.column_value AS token
   FROM TABLE(tokenizer('The rain in spain, stays mainly on the plain.!',' ,.!')) t;

Result.

token
The
rain
in
spain
stays
mainly
on
the
plain

Updated Function

Sometimes it is more than handy to be able to access the separators in the position they are found in the output from a Tokenizer. Here is an updated version of the above that does this.

 -- New types
 --
 -- We need a type to hold the returned tokens
 --
 DROP TYPE T_Token    Force;
 DROP TYPE T_TokenSet Force;
 CREATE TYPE T_Token AS Object (
    id        INTEGER,
    token     varchar2(30000),
    separator varchar2(30000)
 );
 /
 SHOW errors
 GRANT EXECUTE ON T_Token TO public WITH GRANT OPTION;
 CREATE TYPE T_TokenSet AS TABLE OF codesys.t_token;
 /
 SHOW errors
 GRANT EXECUTE ON T_TokenSet TO public WITH GRANT OPTION;
 /*********************************************************************************
 ** @function   : Tokenizer
 ** @precis      : Splits any string into its tokens.
 ** @description : Supplied a string and a list of separators this function
 **                returns resultant tokens as a pipelined collection.
 ** @example     : SELECT t.column_value
 **                  FROM TABLE(tokenizer('The rain in spain, stays mainly on the plain.!',' ,.!') ) t;
 ** @param       : p_string. The string to be Tokenized.
 ** @param       : p_separators. The characters that are used to split the string.
 ** @requires    : t_TokenSet type to be declared.
 ** @history     : Pawel Barut, http://pbarut.blogspot.com/2007/03/yet-another-tokenizer-in-oracle.html
 ** @history     : Simon Greener - July 2006 - Original coding (extended SQL sourced from a blog on the internet)
 ** @history     : Simon Greener - Apr 2012  - Extended TO include returning OF tokens
 **/
 CREATE OR REPLACE
   FUNCTION Tokenizer(p_string     IN VarChar2,
                      p_separators IN VarChar2 DEFAULT ' ')
     RETURN T_TokenSet Pipelined
   AS
     v_tokens codesys.T_TokenSet;
   BEGIN
     IF ( p_string IS NULL
          OR
          p_separators IS NULL ) THEN
        RETURN;
     END IF;
     WITH myCTE AS (
        SELECT c.beg, c.sep, ROW_NUMBER() OVER(ORDER BY c.beg ASC) rid
          FROM (SELECT b.beg, c.sep
                  FROM (SELECT Level beg
                          FROM dual
                         CONNECT BY Level <= LENGTH(p_string)
                       ) b,
                       (SELECT SubStr(p_separators,level,1) AS sep
                         FROM dual
                         CONNECT BY Level <= LENGTH(p_separators)
                       ) c
                 WHERE instr(c.sep,substr(p_string,b.beg,1)) >0
                UNION ALL SELECT 0, CAST(NULL AS varchar2(10)) FROM dual
              ) c
     )
     SELECT T_Token(ROW_NUMBER() OVER (ORDER BY a.rid ASC),
                    CASE WHEN LENGTH(a.token) = 0 THEN NULL ELSE a.token END,
                    a.sep) AS token
       Bulk Collect INTO v_tokens
       FROM (SELECT d.rid,
                    SubStr(p_string,
                           (d.beg + 1),
                           NVL((Lead(d.beg,1) OVER (ORDER BY d.rid ASC) - d.beg - 1),LENGTH(p_string)) ) AS token,                    Lead(d.sep,1) OVER (ORDER BY d.rid ASC) AS sep
               FROM MyCTE d
            ) a
      WHERE LENGTH(a.token) <> 0 OR LENGTH(a.sep) <> 0;
     FOR v_i IN v_tokens.FIRST..v_tokens.LAST loop
        PIPE ROW(v_tokens(v_i));
     END LOOP;
     RETURN;
   END Tokenizer;

Testing

 SELECT DISTINCT t.token
  FROM TABLE(Tokenizer('LineString:MultiLineString:MultiPoint:MultiPolygon:Point:Point:LineString:Polygon:Polygon',':')) t;

Results

TOKEN
LineString
MultiLineString
MultiPoint
MultiPolygon
Point
Polygon

The classic “Rain in Spain…”.

 SELECT t.*
   FROM TABLE(tokenizer('The rain in spain, stays mainly on the plain.!',' ,.!')) t;

Results

ID TOKEN SEPARATOR
1 The {SPACE}
2 rain {SPACE}
3 in {SPACE}
4 spain ,
5 (null) {SPACE}
6 stays {SPACE}
7 mainly {SPACE}
8 on {SPACE}
9 the {SPACE}
10 plain .
11 (null) !

Now, let’s process a POLYGON WKT.

 SELECT t.id, t.token, t.separator
   FROM TABLE(tokenizer('POLYGON((2300 400, 2300 700, 2800 1100, 2300 1100, 1800 1100, 2300 400), (2300 1000, 2400  900, 2200 900, 2300 1000))',' ,()')) t;

Results

ID TOKEN SEPARATOR
1 POLYGON (
2 (null) (
3 2300 {SPACE}
4 400 ,
5 (null) {SPACE}
6 2300 {SPACE}
7 700 ,
8 (null) {SPACE}
9 2800 {SPACE}
10 1100 ,
11 (null) {SPACE}
12 2300 {SPACE}
13 1100 ,
14 (null) {SPACE}
15 1800 {SPACE}
16 1100 ,
17 (null) {SPACE}
18 2300 {SPACE}
19 400 )
20 (null) ,
21 (null) {SPACE}
22 (null) (
23 2300 {SPACE}
24 1000 ,
25 (null) {SPACE}
26 2400 {SPACE}
27 (null) {SPACE}
28 900 ,
29 (null) {SPACE}
30 2200 {SPACE}
31 900 ,
32 (null) {SPACE}
33 2300 {SPACE}
34 1000 )
35 (null) )

This time don’t include the space as a separator.

 SELECT t.id, t.token, t.separator
   FROM TABLE(tokenizer('POLYGON((2300 400, 2300 700, 2800 1100, 2300 1100, 1800 1100, 2300 400), (2300 1000, 2400  900, 2200 900, 2300 1000))',',()')) t;

Results

ID TOKEN SEPARATOR
1 POLYGON (
2 (null) (
3 2300 400 ,
4 2300 700 ,
5 2800 1100 ,
6 2300 1100 ,
7 1800 1100 ,
8 2300 400 )
9 (null) ,
10 {SPACE} (
11 2300 1000 ,
12 2400 900 ,
13 2200 900 ,
14 2300 1000 )
15 (null) )

I hope that someone out there finds this useful.code id=