⚠️ Warning: This is a draft ⚠️
This means it might contain formatting issues, incorrect code, conceptual problems, or other severe issues.
If you want to help to improve and eventually enable this page, please fork RosettaGit's repository and open a merge request on GitHub.
{{task}} [[Category:String manipulation]] It is often necessary to split a string into pieces based on several different (potentially multi-character) separator strings, while still retaining the information about which separators were present in the input.
This is particularly useful when doing small parsing tasks.
The task is to write code to demonstrate this.
The function (or procedure or method, as appropriate) should take an input string and an ordered collection of separators.
The order of the separators is significant:
The delimiter order represents priority in matching, with the first defined delimiter having the highest priority. In cases where there would be an ambiguity as to which separator to use at a particular point (e.g., because one separator is a prefix of another) the separator with the highest priority should be used. Delimiters can be reused and the output from the function should be an ordered sequence of substrings.
Test your code using the input string “a!===b=!=c” and the separators “==”, “!=” and “=”.
For these inputs the string should be parsed as "a" (!=) "" (==) "b" (=) "" (!=) "c", where matched delimiters are shown in parentheses and separated strings are quoted, so the resulting output is "a", empty string, "b", empty string, "c".
Note that the quotation marks are shown for clarity and do not form part of the output.
'''Extra Credit:''' provide information that indicates which separator was matched at each separation point and where in the input string that separator was matched.
Ada
multisplit.adb:
with Ada.Containers.Indefinite_Doubly_Linked_Lists;
with Ada.Text_IO;
procedure Multisplit is
   package String_Lists is new Ada.Containers.Indefinite_Doubly_Linked_Lists
     (Element_Type => String);
   use type String_Lists.Cursor;

   --  Split Source at every occurrence of one of Separators.
   --
   --  At each position the separators are tried in list order, so an
   --  earlier entry wins over a later one that would also match there.
   --  The result holds, in order of occurrence, the non-empty pieces of
   --  text and the separators that were matched, followed by whatever text
   --  remains after the last separator (possibly an empty string).
   --  Empty separators are ignored: they would match without consuming any
   --  input and the scan would never terminate.
   function Split
     (Source     : String;
      Separators : String_Lists.List)
      return String_Lists.List
   is
      Result             : String_Lists.List;
      Next_Position      : Natural := Source'First;  --  current scan position
      Prev_Position      : Natural := Source'First;  --  start of pending text
      Separator_Position : String_Lists.Cursor;
      Separator_Length   : Natural;
      Changed            : Boolean;
   begin
      loop
         Changed := False;
         Separator_Position := Separators.First;
         while Separator_Position /= String_Lists.No_Element loop
            Separator_Length :=
              String_Lists.Element (Separator_Position)'Length;
            if Separator_Length > 0
              and then Next_Position + Separator_Length - 1 <= Source'Last
              and then Source
                (Next_Position .. Next_Position + Separator_Length - 1) =
                String_Lists.Element (Separator_Position)
            then
               --  Flush the text collected since the previous separator.
               if Next_Position > Prev_Position then
                  Result.Append
                    (Source (Prev_Position .. Next_Position - 1));
               end if;
               Result.Append (String_Lists.Element (Separator_Position));
               Next_Position := Next_Position + Separator_Length;
               Prev_Position := Next_Position;
               Changed := True;
               exit;
            end if;
            Separator_Position := String_Lists.Next (Separator_Position);
         end loop;
         --  No separator here: the current character is ordinary text.
         if not Changed then
            Next_Position := Next_Position + 1;
         end if;
         if Next_Position > Source'Last then
            Result.Append (Source (Prev_Position .. Source'Last));
            exit;
         end if;
      end loop;
      return Result;
   end Split;

   --  Print every element of Items on one line, each preceded by a space.
   procedure Put_Items (Items : String_Lists.List) is
      Pos : String_Lists.Cursor := Items.First;
   begin
      while Pos /= String_Lists.No_Element loop
         Ada.Text_IO.Put (" " & String_Lists.Element (Pos));
         Pos := String_Lists.Next (Pos);
      end loop;
      Ada.Text_IO.New_Line;
   end Put_Items;

   Test_Input      : constant String := "a!===b=!=c";
   Test_Separators : String_Lists.List;
begin
   Test_Separators.Append ("==");
   Test_Separators.Append ("!=");
   Test_Separators.Append ("=");
   Put_Items (Split (Test_Input, Test_Separators));

   --  other order of separators
   Test_Separators.Clear;
   Test_Separators.Append ("=");
   Test_Separators.Append ("!=");
   Test_Separators.Append ("==");
   Put_Items (Split (Test_Input, Test_Separators));
end Multisplit;
{{out}}
a != == b = != c
a != = = b = != c
ALGOL 68
# split a string based on a number of separators #
# MODE to hold the split results #
# one SPLITINFO is produced per token; the token after the final delimiter #
# carries an empty delimiter field #
MODE SPLITINFO = STRUCT( STRING text # delimited string, may be empty #
, INT position # starting position of the token #
, STRING delimiter # the delimiter that terminated the token #
);
# calculates the length of string s from its bounds, so it does not #
# depend on the lower bound being 1 #
OP LENGTH = ( STRING s )INT: ( UPB s + 1 ) - LWB s;
# returns TRUE if s starts with p, FALSE otherwise #
# an empty p is a prefix of every s #
PRIO STARTSWITH = 5;
OP STARTSWITH = ( STRING s, p )BOOL: IF LENGTH p > LENGTH s THEN FALSE ELSE s[ LWB s : ( LWB s + LENGTH p ) - 1 ] = p FI;
# returns an array of SPLITINFO describing the tokens in str based on the delimiters #
# zero-length delimiters are ignored #
# delimiters are tried in array order at each position, the first that matches wins #
# text after the final delimiter is returned as a token with an empty delimiter, #
# but only if at least one character follows that delimiter #
PRIO SPLIT = 5;
OP SPLIT = ( STRING str, []STRING delimiters )[]SPLITINFO:
BEGIN
# count the number of tokens #
# allow there to be as many tokens as characters in the string + 2 #
# that would cater for a string composed of delimiters only #
[ 1 : ( UPB str + 3 ) - LWB str ]SPLITINFO tokens;
INT token count := 0;
INT str pos := LWB str;
INT str max = UPB str;
BOOL token pending := FALSE;
# construct the tokens #
# str pos, token count and token pending are reset to the values they were declared with #
str pos := LWB str;
# prev pos is the start of the text collected since the last delimiter #
INT prev pos := LWB str;
token count := 0;
token pending := FALSE;
WHILE str pos <= str max
DO
BOOL found delimiter := FALSE;
# the WHILE part stops the search at the first delimiter that matches #
FOR d FROM LWB delimiters TO UPB delimiters WHILE NOT found delimiter DO
IF LENGTH delimiters[ d ] > 0 THEN
IF found delimiter := str[ str pos : ] STARTSWITH delimiters[ d ] THEN
# store the text before the delimiter, possibly empty, and skip the delimiter #
token count +:= 1;
tokens[ token count ] := ( str[ prev pos : str pos - 1 ], prev pos, delimiters[ d ] );
str pos +:= LENGTH delimiters[ d ];
prev pos := str pos;
token pending := FALSE
FI
FI
OD;
IF NOT found delimiter THEN
# the current character is part of a token #
token pending := TRUE;
str pos +:= 1
FI
OD;
IF token pending THEN
# there is an additional token after the final delimiter #
token count +:= 1;
tokens[ token count ] := ( str[ prev pos : ], prev pos, "" )
FI;
# return an array of the actual tokens #
tokens[ 1 : token count ]
END # SPLIT # ;
# test the SPLIT operator #
# splits the task's sample string and prints one line per token showing its text, #
# its starting position and the delimiter that ended it #
[]SPLITINFO test tokens = "a!===b=!=c" SPLIT []STRING( "==", "!=", "=" );
FOR t FROM LWB test tokens TO UPB test tokens DO
SPLITINFO token = test tokens[ t ];
print( ( "token: [", text OF token, "] at: ", whole( position OF token, 0 ), " delimiter: (", delimiter OF token, ")", newline ) )
OD
{{out}}
token: [a] at: 1 delimiter: (!=)
token: [] at: 4 delimiter: (==)
token: [b] at: 6 delimiter: (=)
token: [] at: 8 delimiter: (!=)
token: [c] at: 10 delimiter: ()
AutoHotkey
; Demo input and the separators, listed in priority order.
Str := "a!===b=!=c"
Sep := ["==", "!=", "="]

; Part 1: split on any of the separators and show the pieces joined by commas.
Joined := ""
for index, piece in StrSplit(Str, Sep)
    Joined .= (Joined ? "," : "") . piece
MsgBox % Joined

; Part 2: build a regex alternation of the separators, each quoted literally
; with \Q...\E, then rewrite every "text + separator" pair as "text {separator}".
Alternation := ""
for index, delim in Sep
    Alternation .= (Alternation ? "|" : "") . "\Q" . delim . "\E"
Pattern := "(.*?)(" . Alternation . ")"
MsgBox % RegExReplace(Str, Pattern, "$1 {$2}")
{{out}}
a,,b,,c
a {!=} {==}b {=} {!=}c
AWK
# syntax: GAWK -f MULTISPLIT.AWK
#
# Splits a sample string on a regex alternation of the separators using the
# four-argument split() (a gawk extension that also stores the matched
# separators), then prints the interleaved result, the pieces alone and the
# separators alone.
BEGIN {
    str = "a!===b=!=c"
    sep = "(==|!=|=)"
    printf("str: %s\n",str)
    printf("sep: %s\n\n",sep)

    # pieces[1..count] are the fields; found[k] is the separator that
    # followed pieces[k]
    count = split(str,pieces,sep,found)

    printf("parsed: ")
    i = 1
    while (i <= count) {
        if (i > 1) { printf(" '%s' ",found[i-1]) }
        printf("'%s'",pieces[i])
        i++
    }

    printf("\n\nstrings: ")
    i = 1
    while (i <= count) {
        printf("'%s' ",pieces[i])
        i++
    }

    printf("\n\nseparators: ")
    i = 1
    while (i < count) {
        printf("'%s' ",found[i])
        i++
    }

    printf("\n")
    exit(0)
}
{{out}}
str: a!===b=!=c
sep: (==|!=|=)
parsed: 'a' '!=' '' '==' 'b' '=' '' '!=' 'c'
strings: 'a' '' 'b' '' 'c'
separators: '!=' '==' '=' '!='
BBC BASIC
REM Demonstrate FNmultisplit on the sample string: first the plain split,
REM then the same split with each matched separator shown in parentheses.
text$ = "a!===b=!=c"
DIM delim$(2)
delim$() = "==", "!=", "="
PRINT "String splits into:"
PRINT FNmultisplit(text$, delim$(), FALSE)
PRINT "For extra credit:"
PRINT FNmultisplit(text$, delim$(), TRUE)
END
REM Split s$ at the separators in d$() and return the quoted pieces as one
REM string.  With info% FALSE the pieces are joined by ", "; with info% TRUE
REM each piece is followed by the separator that ended it, in parentheses.
REM When two separators start at the same position, the one with the lower
REM index in d$() wins.
DEF FNmultisplit(s$, d$(), info%)
LOCAL d%, i%, j%, m%, p%, o$
p% = 1
REPEAT
  REM m% is the position of the nearest separator at or after p%.  One past
  REM the end means "none found"; using LEN(s$) itself as the sentinel would
  REM miss a separator that starts at the very last character.
  m% = LEN(s$) + 1
  FOR i% = 0 TO DIM(d$(),1)
    d% = INSTR(s$, d$(i%), p%)
    REM The strict < keeps the earlier (higher priority) separator on a tie
    IF d% IF d% < m% m% = d% : j% = i%
  NEXT
  IF m% <= LEN(s$) THEN
    o$ += """" + MID$(s$, p%, m%-p%) + """"
    IF info% o$ += " (" + d$(j%) + ") " ELSE o$ += ", "
    p% = m% + LEN(d$(j%))
  ENDIF
UNTIL m% > LEN(s$)
REM Whatever follows the last separator (possibly nothing) is the final piece
= o$ + """" + MID$(s$, p%) + """"
{{out}}
String splits into:
"a", "", "b", "", "c"
For extra credit:
"a" (!=) "" (==) "b" (=) "" (!=) "c"
Bracmat
This is a surprisingly difficult task to solve in Bracmat, because in a naive solution using an alternating pattern ("=="|"!="|"=") the shorter pattern "=" would have precedence over "==". In the solution below the function `oneOf` iterates (by recursion) over the operators, trying to match the start of the current subject string `sjt` with one operator at a time, until success or reaching the end of the list of operators, whichever comes first. If no operator is found at the start of the current subject string, the variable `nonOp` is extended with one byte, thereby shifting the start of the current subject string one byte to the right. Then a new attempt is made to find an operator. This is repeated until either an operator is found, in which case the unparsed string is restricted to the part of the input after the found operator, or no operator is found, in which case the `whl` loop terminates.
{ oneOf: tries the operators passed in arg one at a time, in list order,
  against the start of the current subject string sjt. On success it returns
  the pair (operator . rest of the subject after that operator); when the list
  is exhausted the match on arg fails and so does oneOf. }
( ( oneOf
= operator
. !arg:%?operator ?arg
& ( @(!sjt:!operator ?arg)&(!operator.!arg)
| oneOf$!arg
)
)
{ Sample input and the operators in priority order. }
& "a!===b=!=c":?unparsed
& "==" "!=" "=":?operators
{ Each pass of the loop binds nonOp to the text before the next operator,
  prints that text followed by the operator in braces, and continues with the
  text after the operator. The loop ends when no operator is found. }
& whl
' ( @( !unparsed
: ?nonOp [%(oneOf$!operators:(?operator.?unparsed))
)
& put$(!nonOp str$("{" !operator "} "))
)
{ Print whatever is left after the last operator, then a newline. }
& put$!unparsed
& put$\n
);
{{out}}
a {!=} {==} b {=} {!=} c
C
What kind of silly parsing is this?
#include <stdio.h>
#include <string.h>
/*
 * Copy str to stdout, wrapping every separator occurrence in braces.
 *
 * str - NUL-terminated input text
 * pat - array of len NUL-terminated separators; at each position they are
 *       tried in array order, so a lower index has higher priority
 * len - number of entries in pat
 *
 * Characters that do not start a separator are written unchanged.  Empty
 * separators are skipped: they match everywhere without consuming input,
 * which would keep the scan from ever advancing.
 */
void parse_sep(const char *str, const char *const *pat, int len)
{
	while (*str != '\0') {
		int i, matched = 0;

		for (i = 0; i < len; i++) {
			size_t slen = strlen(pat[i]);

			if (slen == 0 || strncmp(str, pat[i], slen) != 0)
				continue;

			printf("{%.*s}", (int)slen, str);
			str += slen;
			matched = 1;
			break;
		}

		/* No separator starts here: emit one ordinary character. */
		if (!matched)
			putchar(*str++);
	}
}
/* Run parse_sep on the task's sample input and end the output line. */
int main(void)
{
	const char *seps[] = { "==", "!=", "=" };

	/* Derive the count from the array so the two cannot drift apart. */
	parse_sep("a!===b=!=c", seps, (int)(sizeof seps / sizeof seps[0]));
	putchar('\n');
	return 0;
}
{{out}}
a{!=}{==}b{=}{!=}c
## C++
using the Boost library tokenizer!
```cpp
#include <iostream>
#include <boost/tokenizer.hpp>
#include <string>
// Splits the sample string in two tokenizer passes and prints the remaining
// pieces separated by spaces.
// NOTE(review): boost::char_separator appears to take two sets of single
// characters (dropped delimiters, kept delimiters), not multi-character
// separator strings. If so, "==" and "!=" are not matched as units here and
// this does not really implement the task. Confirm against the
// Boost.Tokenizer documentation.
int main( ) {
std::string str( "a!===b=!=c" ) , output ;
typedef boost::tokenizer<boost::char_separator<char> > tokenizer ;
// separator is used for the first pass over str, sep for the second pass
boost::char_separator<char> separator ( "==" , "!=" ) , sep ( "!" ) ;
tokenizer mytok( str , separator ) ;
tokenizer::iterator tok_iter = mytok.begin( ) ;
// first pass: concatenate every token produced from str into output
for ( ; tok_iter != mytok.end( ) ; ++tok_iter )
output.append( *tok_iter ) ;
// second pass: re-tokenize the concatenation on '!' and print each token
tokenizer nexttok ( output , sep ) ;
for ( tok_iter = nexttok.begin( ) ; tok_iter != nexttok.end( ) ;
++tok_iter )
std::cout << *tok_iter << " " ;
std::cout << '\n' ;
return 0 ;
}
{{out}}
a b c## C# '''Extra Credit Solution''' ```c# using System; using System.Collections.Generic; using System.Linq; using System.Text; namespace Multisplit { internal static class Program { private static void Main(string[] args) { foreach (var s in "a!===b=!=c".Multisplit(true, "==", "!=", "=")) // Split the string and return the separators. { Console.Write(s); // Write the returned substrings and separators to the console. } Console.WriteLine(); } private static IEnumerable
String.Split
method does. Using F# Interactive:
```fsharp
> "a!===b=!=c".Split([|"=="; "!="; "="|], System.StringSplitOptions.None);;
val it : string [] = [|"a"; ""; "b"; ""; "c"|]
> "a!===b=!=c".Split([|"="; "!="; "=="|], System.StringSplitOptions.None);;
val it : string [] = [|"a"; ""; ""; "b"; ""; "c"|]
```
System.StringSplitOptions.None
specifies that empty strings should be included in the result.
## Go
```go
package main
import (
"fmt"
"strings"
)
// ms splits txt at every occurrence of one of the separators in sep and
// returns the pieces between them, including empty pieces.
//
// The leftmost separator occurrence is used at each step; when several
// separators start at the same position, the one listed first in sep wins.
// A separator at the very end of txt yields a trailing empty piece, and an
// empty txt yields a single empty piece (as strings.Split does). Empty
// separators are ignored because they match without consuming any input.
func ms(txt string, sep []string) (ans []string) {
	for {
		sepMatch := ""
		posMatch := len(txt)
		for _, s := range sep {
			if s == "" {
				continue
			}
			// Strict < keeps the earlier (higher priority) separator on a tie.
			if p := strings.Index(txt, s); p >= 0 && p < posMatch {
				sepMatch, posMatch = s, p
			}
		}
		ans = append(ans, txt[:posMatch])
		if sepMatch == "" {
			// Nothing left to split on: the piece just added was the last.
			return ans
		}
		txt = txt[posMatch+len(sepMatch):]
	}
}
// main splits the task's sample input and prints the pieces as quoted strings.
func main() {
	input := "a!===b=!=c"
	separators := []string{"==", "!=", "="}
	pieces := ms(input, separators)
	fmt.Printf("%q\n", pieces)
}
```
{{out}}
```txt
["a" "" "b" "" "c"]
```
## Haskell
```Haskell
import Data.List
(isPrefixOf, stripPrefix, genericLength, intercalate)
-- | Find the first delimiter (in list order) that is a prefix of the input
-- and return it together with the input that follows it, or 'Nothing' when
-- no delimiter matches at the start of the input.
--
-- Empty delimiters are skipped: they are a prefix of every string and
-- consume nothing, so a caller that loops on the remainder would never
-- make progress.
trysplit :: String -> [String] -> Maybe (String, String)
trysplit s delims =
  case [ (d, rest)
       | d <- delims
       , not (null d)
       , Just rest <- [stripPrefix d s]
       ] of
    [] -> Nothing
    (hit:_) -> Just hit
-- | Split the input on the given delimiters (earlier ones have priority).
--
-- Each result element is @(text, delimiter, offset)@: the text before a
-- delimiter, the delimiter that ended it, and the 0-based offset of that
-- delimiter in the input. The last element holds the remaining text with
-- an empty delimiter and the length of the input as offset.
--
-- The text is collected in reverse while scanning and is put back in
-- reading order when a token is emitted. Empty delimiters are dropped up
-- front because they would match at every position without consuming input.
multisplit :: String -> [String] -> [(String, String, Int)]
multisplit list delims = go list [] 0
  where
    seps = filter (not . null) delims
    go [] acc pos = [(reverse acc, [], pos)]
    go l@(s:sx) acc pos =
      case trysplit l seps of
        Nothing -> go sx (s : acc) (pos + 1)
        Just (d, sxx) -> (reverse acc, d, pos) : go sxx [] (pos + genericLength d)
-- | Split the sample string and print the pieces joined by commas, followed
-- by the full list of (string, delimiter, offset) triples.
main :: IO ()
main = do
  let parsed = multisplit "a!===b=!=c" ["==", "!=", "="]
      pieces = [text | (text, _, _) <- parsed]
  putStrLn "split string:"
  putStrLn (intercalate "," pieces)
  putStrLn "with [(string, delimiter, offset)]:"
  putStrLn (show parsed)
```
{{out}}
```txt
split string:
a,,b,,c
with [(string, delimiter, offset)]:
[("a","!=",1),("","==",3),("b","=",6),("","!=",7),("c","",10)]
```
Or as a fold:
```haskell
import Data.List (find, isPrefixOf)
-- | Split @s@ on the delimiters @ds@ (earlier ones have priority) with a
-- single left fold over the characters paired with their 0-based index.
--
-- Each result element is @(text, delimiter, offset)@; the final element
-- carries the trailing text, an empty delimiter and the length of @s@.
--
-- The fold state is @(tokens, parts, offset)@: the characters of the
-- current token in reverse order, the finished parts in reverse order, and
-- the index just past the most recently matched delimiter. Characters
-- below that index belong to the delimiter and are skipped. Tokens are
-- reversed back into reading order when they are emitted. Empty
-- delimiters are ignored.
multiSplit :: [String] -> String -> [(String, String, Int)]
multiSplit ds s =
  let lng = length s
      seps = filter (not . null) ds
      step (tokens, parts, offset) (c, i)
        | offset > i = (tokens, parts, offset)
        | otherwise =
          case find (`isPrefixOf` drop i s) seps of
            Just x -> ([], (reverse tokens, x, i) : parts, i + length x)
            Nothing -> (c : tokens, parts, offset)
      (ts, ps, _) = foldl step ([], [], 0) (zip s [0 ..])
  in reverse ((reverse ts, [], lng) : ps)
-- | Print the (string, delimiter, offset) triples for the sample input.
main :: IO ()
main = do
  let delimiters = ["==", "!=", "="]
  print (multiSplit delimiters "a!===b=!=c")
```
{{Out}}
```txt
[("a","!=",1),("","==",3),("b","=",6),("","!=",7),("c","",10)]
```
=={{header|Icon}} and {{header|Unicon}}==
```Icon
# Demonstrates multisplit on the sample string: first the bare tokens, then
# the tokens with the position of every separator.
procedure main()
s := "a!===b=!=c"
# just list the tokens
# (once multisplit is exhausted the alternative write() ends the line)
every writes(multisplit(s,["==", "!=", "="])," ") | write()
# list tokens and indices
# p accumulates every token produced so far, so after appending t the
# expression *p+1-*t is the 1-based position where t starts in s;
# when multisplit is exhausted, break write() ends the line and the loop
every ((p := "") ||:= t := multisplit(s,sep := ["==", "!=", "="])) | break write() do
# only tokens that are equal to one of the separators get a position
if t == !sep then writes(t," (",*p+1-*t,") ") else writes(t," ")
end
# Generates the pieces of s from left to right: separators from the list L
# and the text between them, in the order they occur.
procedure multisplit(s,L)
s ? while not pos(0) do {
# first try each separator, in list order, at the current scan position;
# otherwise take the shortest run of text (via arb) that is followed by
# a separator or by the end of the subject
t := =!L | 1( arb(), match(!L)|pos(0) )
suspend t
}
end
# Generates the substrings of &subject that start at the current &pos, from
# the empty string upwards, moving &pos past each one. The assignment to
# &pos is reversible (<-), so the old position is restored when the
# expression is resumed or fails.
procedure arb()
suspend .&subject[.&pos:&pos <- &pos to *&subject + 1]
end
```
{{out}}
```txt
a != == b = != c
a != (2) == (4) b = (7) != (8) c
```
## J
```j
multisplit=: 4 :0
NB. x is the string to split, y the boxed list of separators (highest priority first).
NB. Result: two rows - the extracted substrings, and for each substring the
NB. index and start position of the separator that follows it (blank for the last).
NB. Find every occurrence of every separator; t holds (separator index, start)
NB. pairs sorted by start position and then by separator index.
'sep begin'=. |: t=. y /:~&.:(|."1)@;@(i.@#@[ ,.L:0"0 I.@E.L:0) x
NB. end is where each occurrence stops: start plus the separator's length.
end=. begin + sep { #@>y
NB. last is where the previously used separator ended, next indexes the candidate in use.
last=. next=. 0
r=. 2 0$0
while. next<#begin do.
NB. append the text between the previous separator and this one, with its (index, start) pair
r=. r,.(last}.x{.~next{begin);next{t
last=. next{end
NB. advance to the first candidate that starts later and does not overlap the one just used
next=. 1 i.~(begin>next{begin)*.begin>:last
end.
NB. the text after the final separator has no separator information
r=. r,.'';~last}.x
)
```
Explanation:
First find all potentially relevant separator instances, and sort them in increasing order, by starting location and separator index. `sep` is the separator index, `begin` is the starting location, and `end` is the ending location.
Then, loop through the possibilities, skipping over those separators which would overlap with previously used separators.
The result consists of two rows: The first row is the extracted substrings, the second row is the "extra credit" part -- for each extracted substring, the numbers in the second row are the separator index for the following separator (0 for the first separator, 1 for the second, ...), and the location in the original string where the beginning of the separator appeared (which is the same as where the end of the extracted substring appeared). Note that the very last substring does not have a separator following it, so the extra credit part is blank for that substring.
Example use:
```j
S=: 'a!===b=!=c'
S multisplit '==';'!=';'='
┌───┬───┬───┬───┬─┐
│a │ │b │ │c│
├───┼───┼───┼───┼─┤
│1 1│0 3│2 6│1 7│ │
└───┴───┴───┴───┴─┘
S multisplit '=';'!=';'=='
┌───┬───┬───┬───┬───┬─┐
│a │ │ │b │ │c│
├───┼───┼───┼───┼───┼─┤
│1 1│0 3│0 4│0 6│1 7│ │
└───┴───┴───┴───┴───┴─┘
'X123Y' multisplit '1';'12';'123';'23';'3'
┌───┬───┬─┐
│X │ │Y│
├───┼───┼─┤
│0 1│3 2│ │
└───┴───┴─┘
```
## Java
```java
import java.util.*;
public class MultiSplit {
public static void main(String[] args) {
System.out.println("Regex split:");
System.out.println(Arrays.toString("a!===b=!=c".split("==|!=|=")));
System.out.println("\nManual split:");
for (String s : multiSplit("a!===b=!=c", new String[]{"==", "!=", "="}))
System.out.printf("\"%s\" ", s);
}
static List
Perl 6 automatically returns Match objects that will stringify to the matched pattern, but can also be interrogated for their match positions, as illustrated above by post-processing the results two different ways.
## Phix
```Phix
procedure multisplit(string text, sequence delims)
integer k = 1, kdx
while 1 do
integer kmin = 0
for i=1 to length(delims) do
integer ki = match(delims[i],text,k)
if ki!=0 then
if kmin=0 or kichoose
are applied in parallel, and all potentially match at the current position in the text.
However :shortest tok
means that only that clause survives (gets to propagate its bindings and position advancement) which minimizes the length of the string which is bound to the tok
variable.
The :gap 0
makes the horizontal collect repetitions strictly adjacent. This means that coll
will quit when faced with a nonmatching suffix portion of the data rather than scan forward (no gap allowed!). This creates an opportunity for the tail
variable to grab the suffix which remains, which may be an empty string.
```txr
@; Splits the first command-line argument on the separators ==, != and =.
@; Each repetition of the collect binds tok to the text before a separator
@; and sep to the separator itself; tail receives whatever is left once no
@; further separator follows. The output quotes every token and shows each
@; matched separator in braces.
@(next :args)
@(coll :gap 0)@(choose :shortest tok)@\
@tok@{sep /==/}@\
@(or)@\
@tok@{sep /!=/}@\
@(or)@\
@tok@{sep /=/}@\
@(end)@(end)@tail
@(output)
@(rep)"@tok" {@sep} @(end)"@tail"
@(end)
```
Runs:
```txt
$ ./txr multisplit.txr 'a!===b=!=c'
"a" {!=} "" {==} "b" {=} "" {!=} "c"
$ ./txr multisplit.txr 'a!===!==!=!==b'
"a" {!=} "" {==} "" {!=} "" {=} "" {!=} "" {!=} "" {=} "b"
$ ./txr multisplit.txr ''
""
$ ./txr multisplit.txr 'a'
"a"
$ ./txr multisplit.txr 'a='
"a" {=} ""
$ ./txr multisplit.txr '='
"" {=} ""
$ ./txr multisplit.txr '=='
"" {==} ""
$ ./txr multisplit.txr '==='
"" {==} "" {=} ""
```
### Using the tok-str function
{{trans|Racket}}
```sh
$ txr -p '(tok-str "a!===b=!=c" #/==|!=|=/ t)'
("a" "!=" "" "==" "b" "=" "" "!=" "c")
```
Here the third boolean argument means "keep the material between the tokens", which in the Racket version seems to be requested by the argument `#:gap-select? #t`.
## UNIX Shell
{{works with|bash}}
```bash
# multisplit STRING SEPARATOR...
#
# Splits STRING at the given separators and appends the results to the
# global arrays `words` (text between separators, possibly empty) and `seps`
# (the separator that followed each word). The text after the last
# separator is appended to `words` without a matching `seps` entry.
#
# Separators are compared literally and tried in argument order at each
# position, so an earlier argument has priority over a later one. A regex
# alternation is not used for this: it picks the longest match instead of
# the first listed one and would treat characters such as * or . specially.
# Empty separators are ignored.
multisplit() {
    local str=$1
    shift
    local sep matched word=
    while [[ -n $str ]]; do
        matched=
        for sep in "$@"; do
            # Quoting "$sep" makes the prefix test literal, not a glob.
            if [[ -n $sep && $str == "$sep"* ]]; then
                matched=$sep
                break
            fi
        done
        if [[ -n $matched ]]; then
            words+=( "$word" )
            seps+=( "$matched" )
            word=
            str=${str:${#matched}}
        else
            # No separator here: move one character into the current word.
            word+=${str:0:1}
            str=${str:1}
        fi
    done
    words+=( "$word" )
}
# Split the sample string, print every word with the separator that followed
# it, and check that concatenating them reproduces the input.
words=() seps=()
original="a!===b=!=c"
recreated=""
multisplit "$original" "==" "!=" "="
for ((i=0; i<${#words[@]}; i++)); do
    # The last word has no separator, so ${seps[i]} expands to nothing there.
    printf 'w:"%s"\ts:"%s"\n' "${words[i]}" "${seps[i]}"
    recreated+="${words[i]}${seps[i]}"
done
# Quote the right-hand side so it is compared literally, not as a glob pattern.
if [[ $original == "$recreated" ]]; then
    echo "successfully able to recreate original string"
fi
```
{{out}}
```txt
w:"a" s:"!="
w:"" s:"=="
w:"b" s:"="
w:"" s:"!="
w:"c" s:""
successfully able to recreate original string
```
## VBScript
```vb
' Splits s at the separators listed in sep (a "|"-delimited list, highest
' priority first) and returns the pieces joined by commas.
' The string is scanned from left to right; at each position the separators
' are tried in list order and the first one that matches is consumed.
' Splitting on one separator after another over the whole string instead
' would let an earlier separator match text that a left-to-right scan has
' already assigned to a different one. Empty separators are ignored, and
' the caller's s is left unchanged.
Function multisplit(s,sep)
	Dim arr_sep, pos, i, hit, out
	arr_sep = Split(sep,"|")
	out = ""
	pos = 1
	Do While pos <= Len(s)
		hit = -1
		For i = 0 To UBound(arr_sep)
			If Len(arr_sep(i)) > 0 Then
				If Mid(s,pos,Len(arr_sep(i))) = arr_sep(i) Then
					hit = i
					Exit For
				End If
			End If
		Next
		If hit >= 0 Then
			out = out & ","
			pos = pos + Len(arr_sep(hit))
		Else
			' Not a separator: copy the character through.
			out = out & Mid(s,pos,1)
			pos = pos + 1
		End If
	Loop
	multisplit = out
End Function
' Like multisplit, but instead of a comma each matched separator is shown in
' parentheses at the place where it occurred, e.g. "a(!=)(==)b".
' sep is a "|"-delimited list of separators, highest priority first.
' The string is scanned from left to right and the output is built directly,
' so no placeholder characters are written into the text. Using digits as
' placeholders would corrupt any input that itself contains digits. Empty
' separators are ignored, and the caller's s is left unchanged.
Function multisplit_extra(s,sep)
	Dim arr_sep, pos, i, hit, out
	arr_sep = Split(sep,"|")
	out = ""
	pos = 1
	Do While pos <= Len(s)
		hit = -1
		For i = 0 To UBound(arr_sep)
			If Len(arr_sep(i)) > 0 Then
				If Mid(s,pos,Len(arr_sep(i))) = arr_sep(i) Then
					hit = i
					Exit For
				End If
			End If
		Next
		If hit >= 0 Then
			out = out & "(" & arr_sep(hit) & ")"
			pos = pos + Len(arr_sep(hit))
		Else
			' Not a separator: copy the character through.
			out = out & Mid(s,pos,1)
			pos = pos + 1
		End If
	Loop
	multisplit_extra = out
End Function
' Show both variants for the sample string, one per output line.
sample = "a!===b=!=c"
separators = "!=|==|="
WScript.StdOut.WriteLine "Standard: " & multisplit(sample,separators)
WScript.StdOut.WriteLine "Extra Credit: " & multisplit_extra(sample,separators)
```
{{out}}
```txt
Standard: a,,b,,c
Extra Credit: a(!=)(==)b(=)(!=)c
```
## Yabasic
```Yabasic
rem Split t$ on the comma-separated separators in s$ and print each piece
rem together with the separator that ended it; the text after the last
rem separator is printed on its own.
t$ = "a!===b=!=c"
s$ = "==,!=,="
dim n$(1)
n = token(s$, n$(), ",")
dim p(n)
do
    rem l is the position of the nearest separator; one past the end means
    rem "none found". Starting from len(t$) itself would miss a separator
    rem that begins at the last character of t$.
    l = len(t$) + 1
    j = 0
    for i = 1 to n
        p(i) = instr(t$, n$(i))
        rem the strict < keeps the earlier separator when two start at the same place
        if p(i) and p(i) < l then l = p(i) : j = i end if
    next
    if not j print t$ : break
    print left$(t$, l - 1), " with separator ", n$(j)
    rem keep only the text that follows the matched separator
    t$ = right$(t$, len(t$) - (l + len(n$(j))) + 1)
loop
```
## zkl
{{trans|Python}}
```zkl
// Splits text on the separators in sep (tried in list order at each position).
// Returns a list holding the non-empty pieces of text and, in place of each
// matched separator, a pair (separator index, position of the match).
fcn multisplit(text, sep){
lastmatch := i := 0; matches := List();
while(i < text.len()){
// j is the separator's index in sep, s the separator itself
foreach j,s in ([0..].zip(sep)){
// true when find reports s at index i, i.e. s starts exactly here
if(i == text.find(s,i)){
// flush the text collected since the previous separator, if any
if(i > lastmatch) matches.append(text[lastmatch,i-lastmatch]);
matches.append(T(j,i)); # Replace the string containing the matched separator with a tuple of which separator and where in the string the match occurred
lastmatch = i + s.len();
// one less than the separator length, because the i += 1 below adds the rest
i += s.len()-1;
break;
}
}
i += 1;
}
// trailing text after the last separator
if(i > lastmatch) matches.append(text[lastmatch,i-lastmatch]);
return(matches);
}
```
```zkl
// Same input with two separator orders; the separator index in each result
// pair refers to the order given in that call.
multisplit("a!===b=!=c", T("==", "!=", "=")).println();
multisplit("a!===b=!=c", T("!=", "==", "=")).println();
```
{{out}}
```txt
L("a",L(1,1),L(0,3),"b",L(2,6),L(1,7),"c")
L("a",L(0,1),L(1,3),"b",L(2,6),L(0,7),"c")
```