I have these three functions that successfully remove all non-numeric characters from a given string:
The first function loops through the characters of the input string, and if the current character is a number, it adds it to a new string that is returned as the result of the function.
function RemoveNonNumericChars(const s: string): string;
begin
Result := '';
for var i := 1 to Length(s) do
begin
if s[i] in ['0'..'9'] then
Result := Result + s[i];
end;
end;
The second function loops through the characters of the input string from right to left, and if the current character is not a number, it uses the Delete
function to remove it from the string
function RemoveNonNumericChars(const s: string): string;
begin
Result := s;
for var i := Length(Result) downto 1 do
begin
if not(Result[i] in ['0'..'9']) then
Delete(Result, i, 1);
end;
end;
The third function uses a regular expression to replace all non-numeric characters with nothing, thus removing them. TRegEx
is from the System.RegularExpressions
unit.
function RemoveNonNumericChars(const s: string): string;
begin
var RegEx := TRegEx.Create('[^0-9]');
Result := RegEx.Replace(s, '');
end;
All three of them do what I need, but I want to know if there is maybe a built-in function in Delphi for this... Or maybe even a better way to do it than the way I'm doing it. What's the best and/or fastest way to remove non-numeric characters from a string in Delphi?
Both your approaches are slow because you constantly change the length of the string. Also, they only recognise Arabic digits.
To solve the performance issue, preallocate the maximum result length:
function RemoveNonDigits(const S: string): string;
begin
SetLength(Result, S.Length);
var LActualLength := 0;
for var i := 1 to S.Length do
if CharInSet(S[i], ['0'..'9']) then
begin
Inc(LActualLength);
Result[LActualLength] := S[i];
end;
SetLength(Result, LActualLength);
end;
To support non-Arabic digits, use the TCharacter.IsDigit
function:
function RemoveNonDigits(const S: string): string;
begin
SetLength(Result, S.Length);
var LActualLength := 0;
for var i := 1 to S.Length do
if S[i].IsDigit then
begin
Inc(LActualLength);
Result[LActualLength] := S[i];
end;
SetLength(Result, LActualLength);
end;
To optimise even further, as suggested by Stefan Glienke, you can bypass the RTL's string handling machinery and write each character directly with some loss of code readability:
function RemoveNonDigits(const S: string): string;
begin
SetLength(Result, S.Length);
var ResChr := PChar(Result);
var LActualLength := 0;
for var i := 1 to S.Length do
if CharInSet(S[i], ['0'..'9']) then
begin
Inc(LActualLength);
ResChr^ := S[i];
Inc(ResChr);
end;
SetLength(Result, LActualLength);
end;
Benchmark
Just for fun I did a very primitive benchmark on random input strings with length < 100 and about 24% chance of a char being a digit:
program Benchmark;
{$APPTYPE CONSOLE}
{$R *.res}
uses
System.SysUtils, System.RegularExpressions, Windows;
function OP1(const s: string): string;
begin
Result := '';
for var i := 1 to Length(s) do
begin
if s[i] in ['0'..'9'] then
Result := Result + s[i];
end;
end;
function OP2(const s: string): string;
begin
Result := s;
for var i := Length(Result) downto 1 do
begin
if not(Result[i] in ['0'..'9']) then
Delete(Result, i, 1);
end;
end;
function OP3(const s: string): string;
begin
var RegEx := TRegEx.Create('[^0-9]');
Result := RegEx.Replace(s, '');
end;
function AR1(const S: string): string;
begin
SetLength(Result, S.Length);
var LActualLength := 0;
for var i := 1 to S.Length do
if CharInSet(S[i], ['0'..'9']) then
begin
Inc(LActualLength);
Result[LActualLength] := S[i];
end;
SetLength(Result, LActualLength);
end;
function AR2(const S: string): string;
begin
SetLength(Result, S.Length);
var ResChr := PChar(Result);
var LActualLength := 0;
for var i := 1 to S.Length do
if CharInSet(S[i], ['0'..'9']) then
begin
Inc(LActualLength);
ResChr^ := S[i];
Inc(ResChr);
end;
SetLength(Result, LActualLength);
end;
function AR3(const S: string): string;
begin
SetLength(Result, S.Length);
var ResChr := PChar(Result);
for var i := 1 to S.Length do
if CharInSet(S[i], ['0'..'9']) then
begin
ResChr^ := S[i];
Inc(ResChr);
end;
SetLength(Result, ResChr - PChar(Result));
end;
function RandomInputString: string;
begin
SetLength(Result, Random(100));
for var i := 1 to Result.Length do
Result[i] := Chr(Ord('0') + Random(42));
end;
begin
Randomize;
const N = 1000000;
var Data := TArray<string>(nil);
SetLength(Data, N);
for var i := 0 to N - 1 do
Data[i] := RandomInputString;
var f, c0, cOP1, cOP2, cOP3, cAR1, cAR2, cAR3: Int64;
QueryPerformanceFrequency(f);
QueryPerformanceCounter(c0);
for var i := 0 to High(Data) do
OP1(Data[i]);
QueryPerformanceCounter(cOP1);
Dec(cOP1, c0);
QueryPerformanceCounter(c0);
for var i := 0 to High(Data) do
OP2(Data[i]);
QueryPerformanceCounter(cOP2);
Dec(cOP2, c0);
QueryPerformanceCounter(c0);
for var i := 0 to High(Data) do
OP3(Data[i]);
QueryPerformanceCounter(cOP3);
Dec(cOP3, c0);
QueryPerformanceCounter(c0);
for var i := 0 to High(Data) do
AR1(Data[i]);
QueryPerformanceCounter(cAR1);
Dec(cAR1, c0);
QueryPerformanceCounter(c0);
for var i := 0 to High(Data) do
AR2(Data[i]);
QueryPerformanceCounter(cAR2);
Dec(cAR2, c0);
QueryPerformanceCounter(c0);
for var i := 0 to High(Data) do
AR3(Data[i]);
QueryPerformanceCounter(cAR3);
Dec(cAR3, c0);
Writeln('Computations per second:');
Writeln('OP1: ', Round(N / (cOP1 / f)));
Writeln('OP2: ', Round(N / (cOP2 / f)));
Writeln('OP3: ', Round(N / (cOP3 / f)));
Writeln('AR1: ', Round(N / (cAR1 / f)));
Writeln('AR2: ', Round(N / (cAR2 / f)));
Writeln('AR3: ', Round(N / (cAR3 / f)));
Readln;
end.
Result:
Computations per second:
OP1: 1398134
OP2: 875116
OP3: 39162
AR1: 3406172
AR2: 4063260
AR3: 4032343
As you can see, in this test at least, regular expressions are by far the slowest approach. And preallocating makes a major difference, while avoiding the _UniqueStringU
issue appears to make only a relatively minor improvement.
But even with the very slow RegEx approach, you can do 40 000 calls per second. On my 13-year-old computer.