I know that if I want to parse a string into separate words, the easiest way is.
This is my string:
The Microsoft business analysis software
Code:
SELECT special_term, display_term
FROM sys.dm_fts_parser ('"The Microsoft business analysis software"', 1033, 0, 0)
Which returns this:
special_term display_term
------------ ------------
Noise Word the
Exact Match microsoft
Exact Match business
Exact Match analysis
Exact Match software
But suppose I have a string that has some of the words enclosed in double quotes like this - string:
The Microsoft "business analysis" software
I want to produce the following:
special_term display_term
------------ ------------
Noise Word the
Exact Match microsoft
Exact Match business analysis
Exact Match software
Is there a reasonable way to do this in SQL Server?
We can "temporarily" replace spaces by some symbol (_
) in quoted phrases and replace back after splitting.
Phrase "business analysis"
-> business_analysis
->business analysis
.
sys.dm_fts_parser
output this "words" as Exact Match
.
See example
id | sentence |
---|---|
1 | The Microsoft "business analysis" software And other "helpful programs" |
2 | But suppose I have a string that has "some of the words" enclosed in double quotes like this - string |
with data as(
select 1 id
,'The Microsoft "business analysis" software And other "helpful programs"' sentence
union all
select 2 id
,'But suppose I have a string that has "some of the words" enclosed in double quotes like this - string'
)
Recursive replace ' '
by '_'
in quoted phrases and remove"
,r as(
select 0 lvl,id
,case when charindex('"',sentence)>0
and charindex('"',sentence,charindex('"',sentence)+1)>0
then left(sentence,charindex('"',sentence)-1)
+replace(substring(sentence,charindex('"',sentence)+1
,charindex('"',sentence,charindex('"',sentence)+1)-charindex('"',sentence)-1)
,' ','_')
+right(sentence,len(sentence)-charindex('"',sentence,charindex('"',sentence)+1))
else sentence
end sentence
from data
union all
select lvl+1,id
,case when charindex('"',sentence)>0
and charindex('"',sentence,charindex('"',sentence)+1)>0
then substring(sentence,1,charindex('"',sentence)-1)
+replace(substring(sentence,charindex('"',sentence)+1
,charindex('"',sentence,charindex('"',sentence)+1)-charindex('"',sentence)-1)
,' ','_')
+substring(sentence,charindex('"',sentence,charindex('"',sentence)+1)+1,100)
else sentence
end sentence
from r
where charindex('"',sentence)>0
)
select id,occurrence,special_term,replace(display_term,'_',' ')display_term
--,source_term ,group_id,phrase_id
FROM (select *,row_number()over(partition by id order by lvl desc)rn from r)r
cross apply sys.dm_fts_parser ('"'+r.sentence+'"',1033,0,0)
where rn=1
Output
id | occurrence | special_term | display_term |
---|---|---|---|
1 | 1 | Noise Word | the |
1 | 2 | Exact Match | microsoft |
1 | 3 | Exact Match | business analysis |
1 | 4 | Exact Match | software |
1 | 5 | Noise Word | and |
1 | 6 | Noise Word | other |
1 | 7 | Exact Match | helpful programs |
2 | 1 | Noise Word | but |
2 | 2 | Exact Match | suppose |
2 | 3 | Noise Word | i |
2 | 4 | Noise Word | have |
2 | 5 | Noise Word | a |
2 | 6 | Exact Match | string |
2 | 7 | Noise Word | that |
2 | 8 | Noise Word | has |
2 | 9 | Exact Match | some of the words |
2 | 10 | Exact Match | enclosed |
2 | 11 | Noise Word | in |
2 | 12 | Exact Match | double |
2 | 13 | Exact Match | quotes |
2 | 14 | Noise Word | like |
2 | 15 | Noise Word | this |
2 | 16 | Exact Match | string |