I'm implementing a tool that lets users search for terms in texts. I'm currently focused on handling more complex search input. The operators I am looking to support are AND, OR, and NOT.
This code should go into the Python backend that builds the query for the database engine, so I need a way to parse the query and transform the appropriate parts. Are there modules that would allow me to do that?

I have tried looking at NLTK's logic package, but it seems to do far too many things and it's not clear to me how to reduce it to these functions. I think I need something like NLTK's grammar trees, but all I've found are packages that add grammatical tags and are thus tied to a language model.
I would use PLY (Python Lex-Yacc) or SLY to parse it. (PLY and SLY have the same author, but PLY uses functions and SLY uses classes.)
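Just to illustrate the difference in style, a minimal PLY version of the lexer might look like this (only a sketch, assuming ply is installed; PLY uses module-level functions and docstring regexes, while SLY, used in the full code below, wraps everything in classes):

import ply.lex as lex

tokens = ('TEXT',)
literals = ['&', '|', '^', '!', '~', '(', ')']
t_ignore = ' \t'

def t_TEXT(t):
    r'[^&|^!~()]+'
    return t

def t_error(t):
    print("Illegal character '%s'" % t.value[0])
    t.lexer.skip(1)

lexer = lex.lex()
lexer.input('^(abc & def) | xyz')
for token in lexer:
    print(token.type, token.value)  # prints each token's type and value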
I took the calc.py example from SLY and created code which converts a query like

^(abc & def) | xyz

to the nested list

['OR', ['NOT', ['AND', 'abc', 'def']], 'xyz']

which should be easy to use to generate a SQL query.
Another example:
(A&B)|(^C&D)
==> ['OR', ['AND', 'A', 'B'], ['AND', ['NOT', 'C'], 'D']]
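The nested list is also easy to consume directly. As a hedged sketch (the matches helper is my own, not part of the parser code), here is a recursive walk that evaluates a tree against a piece of text; generating SQL works the same way, by recursing over the list:

def matches(node, text):
    # a leaf is a plain search term -> substring test
    if isinstance(node, str):
        return node in text
    op, *args = node
    if op == 'AND':
        return all(matches(arg, text) for arg in args)
    if op == 'OR':
        return any(matches(arg, text) for arg in args)
    if op == 'NOT':
        return not matches(args[0], text)
    raise ValueError(f'unknown operator: {op}')

tree = ['OR', ['NOT', ['AND', 'abc', 'def']], 'xyz']
print(matches(tree, 'some text with xyz'))  # True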
I changed TEXT so it can catch A B C as one string without using " ". And if you use "A B C", then it returns it as one string without the " ". It even allows " inside the text (as long as there is no " at the very beginning and end):

A"B"C & "F G H" & I J K
==> ['AND', ['AND', 'A"B"C', 'F G H'], 'I J K']
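A quick way to see what the TEXT pattern matches before the parser strips spaces and quotes (a small demo of the rule used below, standard library only):

import re

TEXT = r'[^&|^!~()]{1,}'  # at least one char which is not an operator

print(re.findall(TEXT, 'A"B"C & "F G H" & I J K'))
# ['A"B"C ', ' "F G H" ', ' I J K'] -- the operators split the input;
# the parser later strips the surrounding spaces and quotes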
But it doesn't allow the characters &|^!~() inside the text. That would need some changes in TEXT and in the grammar functions:

"A&B"
==> ['AND', 'A', 'B']
I added ! as NOT because Python uses != for "not equal", so I was typing ! automatically in queries. I added ~ as NOT because I often use it in pandas for negation, so sometimes I was typing ~ automatically in queries.

from sly import Lexer, Parser
import readline  # allows using the arrow keys to recall previous queries in the current session

class QueryLexer(Lexer):
    tokens = { TEXT }
    ignore = ' \t'
    literals = { '&', '|', '^', '!', '~', '(', ')' }

    # Tokens
    #TEXT = r'[a-zA-Z_][a-zA-Z0-9_]*'
    TEXT = r'[^&|^!~()]{1,}'  # at least one char which is not in `literals`

    @_(r'\n+')
    def newline(self, t):
        self.lineno += t.value.count('\n')

    def error(self, t):
        print("Illegal character '%s'" % t.value[0])
        self.index += 1

class QueryParser(Parser):
    tokens = QueryLexer.tokens

    precedence = (
        ('left', '&', '|', '!', '~'),
        ('right', '^'),
    )

    @_('expr')
    def statement(self, p):
        return p.expr

    @_('expr "&" expr')
    def expr(self, p):
        return ['AND', p.expr0, p.expr1]

    @_('expr "|" expr')
    def expr(self, p):
        return ['OR', p.expr0, p.expr1]

    @_('"^" expr', '"!" expr', '"~" expr')
    def expr(self, p):
        return ['NOT', p.expr]

    @_('"(" expr ")"')
    def expr(self, p):
        return p.expr

    @_('TEXT')
    def expr(self, p):
        return p.TEXT.strip(' ').strip('"')

if __name__ == '__main__':
    lexer = QueryLexer()
    parser = QueryParser()

    while True:
        try:
            text = input('query > ')
        except EOFError:
            break
        except KeyboardInterrupt:
            break
        if text:
            result = parser.parse(lexer.tokenize(text))
            if isinstance(result, str):
                print(f'text: >{result}<')  # the `> <` markers show whether there are surrounding spaces
            else:
                print(f'list: {result}')
                #import json
                #print(json.dumps(result, indent=1))
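An example session (the conversions shown earlier, as the program prints them):

query > ^(abc & def) | xyz
list: ['OR', ['NOT', ['AND', 'abc', 'def']], 'xyz']
query > (A&B)|(^C&D)
list: ['OR', ['AND', 'A', 'B'], ['AND', ['NOT', 'C'], 'D']]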
Here is a version with more modifications.

The original version converts A & B & C & D to ['AND', ['AND', ['AND', 'A', 'B'], 'C'], 'D'], and the new version can use the function flatten() to create ['AND', 'A', 'B', 'C', 'D'] instead.
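Note that flatten() (shown in the listing below) only merges one level, but because the parser calls it at every reduction, the inner lists are already flat by the time the outer one is built:

def flatten(data):
    key = data[0]
    values = []
    for item in data[1:]:
        if isinstance(item, list) and item[0] == key:
            values.extend(item[1:])
        else:
            values.append(item)
    return [key, *values]

# the parser reduces left-to-right, flattening as it goes:
step1 = flatten(['AND', 'A', 'B'])    # ['AND', 'A', 'B']
step2 = flatten(['AND', step1, 'C'])  # ['AND', 'A', 'B', 'C']
step3 = flatten(['AND', step2, 'D'])  # ['AND', 'A', 'B', 'C', 'D']
print(step3)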
If you send .flat or .notflat as the query, then it switches the variable MAKE_FLAT, which decides whether flatten() should be used.
I also added function which tries to convert it to SQL query with [VAR] LIKE "%text%"
and for list ['AND', 'A', 'B', 'C', 'D']
it gives string
([VAR] LIKE "%A%" AND [VAR] LIKE "%B%" AND [VAR] LIKE "%C%" AND [VAR] LIKE "%D%")
So you have to only replace [VAR]
with your variable.
But I don't know if I should add % automatically or whether the user should write A% & %B, because that makes it possible to check if the text starts with A and ends with B. Eventually the user could write A* & *B & C and the code would convert it to A% AND %B AND %C% (adding % automatically only when there is no * on either side).
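A minimal sketch of that * -> % rule (this mirrors what generate() in the listing below does for plain text; the helper name to_like is my own):

def to_like(text):
    # user wrote explicit wildcards -> respect them, just translate * to %
    if '*' in text:
        return text.replace('*', '%')
    # no wildcard -> default to a substring match
    return f'%{text}%'

for query in ['A*', '*B', 'C']:
    print(to_like(query))  # A%  %B  %C%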
I also added code which saves the history to a file and reads it back at the next start.
I also added json to display the data as

[
 "AND",
 "A",
 [
  "OR",
  "B",
  "C",
  "D"
 ],
 [
  "NOT",
  "X"
 ]
]
I added a .history command (and the shortcut .h) to see the history.
I also added a way to select a field, like day:*h*day & title:Hello, which gives (day LIKE "%h%day" AND title LIKE "%Hello%").
from sly import Lexer, Parser
import readline
import atexit
import os
import json

#COLOR = '\x1b[1;31m'  # red
COLOR = '\x1b[1;32m'  # green
RESET = '\x1b[m'

BASE = os.path.dirname(os.path.abspath(__file__))
HISTORY_PATH = os.path.join(BASE, ".history")

MAKE_FLAT = True
DEFAULT_VAR = '[VAR]'

class QueryLexer(Lexer):
    tokens = { TEXT }
    ignore = ' \t'
    literals = { '&', '|', '^', '!', '~', '(', ')' }

    # Tokens
    #WORD = r'[^&|^!~()"]{1,}'  # at least one char which is not in `literals`
    #PHRASE = r'"[^"]*"'
    TEXT = r'"[^"]*"|[^&|^!~()]{1,}'  # quoted phrase, or at least one char which is not in `literals`

    @_(r'\n+')
    def newline(self, t):
        self.lineno += t.value.count('\n')

    def error(self, t):
        print("Illegal character '%s'" % t.value[0])
        self.index += 1

class QueryParser(Parser):
    tokens = QueryLexer.tokens

    precedence = (
        ('left', '&', '|', '!', '~'),
        ('right', '^'),
    )

    @_('expr')
    def statement(self, p):
        return p.expr

    @_('expr "&" expr')
    def expr(self, p):
        result = ['AND', p.expr0, p.expr1]
        if MAKE_FLAT:
            result = flatten(result)
        return result

    @_('expr "|" expr')
    def expr(self, p):
        result = ['OR', p.expr0, p.expr1]
        if MAKE_FLAT:
            result = flatten(result)
        return result

    @_('"^" expr', '"!" expr', '"~" expr')
    def expr(self, p):
        return ['NOT', p.expr]

    @_('"(" expr ")"')
    def expr(self, p):
        return p.expr

    @_('TEXT')
    def expr(self, p):
        return p.TEXT.strip(' ')

def flatten(data):
    # merge directly nested lists that share the same operator
    key = data[0]
    values = []
    for item in data[1:]:
        if isinstance(item, list) and item[0] == key:
            values.extend(item[1:])
        else:
            values.append(item)
    return [key, *values]

def generate(data):
    if isinstance(data, str):
        text = data
        var = DEFAULT_VAR
        if not text.startswith('"') and (':' in text):
            var, text = text.split(':', 1)
            print(var, text)  # debug
        text = text.strip('"')
        if '*' in text:
            text = text.replace('*', '%')
        if '%' not in text:
            text = f'%{text}%'
        return f'{var} LIKE "{text}"'
    key = data[0]
    values = data[1:]
    if key in ['NOT']:
        # assumes the operand of NOT is plain text, not a nested list
        text = data[1]
        var = DEFAULT_VAR
        if not text.startswith('"') and (':' in text):
            var, text = text.split(':', 1)
            print(var, text)  # debug
        text = text.strip('"')
        if '*' in text:
            text = text.replace('*', '%')
        if '%' not in text:
            text = f'%{text}%'
        return f'{var} NOT LIKE "{text}"'
    if key in ['AND', 'OR']:
        key = f' {key} '  # add spaces
        # convert to `var LIKE value`
        values = [generate(item) for item in values]
        # join using AND, OR
        text = key.join(values)
    else:
        text = str(data)
    return f'({text})'  # add ( )

if __name__ == '__main__':
    # read history at the beginning
    try:
        readline.read_history_file(HISTORY_PATH)
        readline.set_history_length(1000)
    except FileNotFoundError:
        pass
    # write history at the end
    atexit.register(readline.write_history_file, HISTORY_PATH)

    lexer = QueryLexer()
    parser = QueryParser()

    number = 0
    while True:
        try:
            number += 1
            text = input(f'[{number}] {COLOR}query >{RESET} ')
        except EOFError:
            break
        except KeyboardInterrupt:
            break
        if text:
            result = parser.parse(lexer.tokenize(text))
            if isinstance(result, str):
                print(f'text: >{result}<')  # the `> <` markers show whether there are surrounding spaces
                cmd = result.split(' ')
                if cmd[0] in ('.history', '.h'):
                    for index in range(1, readline.get_current_history_length()+1):
                        print(f'[{index}]', readline.get_history_item(index))
                elif cmd[0] in ('.flat', '.f'):
                    MAKE_FLAT = True
                    print('MAKE_FLAT:', MAKE_FLAT)
                elif cmd[0] in ('.notflat', '.nf'):
                    MAKE_FLAT = False
                    print('MAKE_FLAT:', MAKE_FLAT)
                elif cmd[0] in ('.var', '.v'):
                    if len(cmd) > 1:
                        DEFAULT_VAR = cmd[1]
                    print('DEFAULT_VAR:', DEFAULT_VAR)
                else:
                    print(f'data: {result}')
                    print(json.dumps(result, indent=1))
                    print(generate(result))
            else:
                print(f'data: {result}')
                print(json.dumps(result, indent=1))
                print(generate(result))
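A sample session with this version (the json dump printed between the data: line and the SQL line is omitted here for brevity):

[1] query > A & B & C & D
data: ['AND', 'A', 'B', 'C', 'D']
([VAR] LIKE "%A%" AND [VAR] LIKE "%B%" AND [VAR] LIKE "%C%" AND [VAR] LIKE "%D%")
[2] query > .notflat
text: >.notflat<
MAKE_FLAT: False
[3] query > A & B & C & D
data: ['AND', ['AND', ['AND', 'A', 'B'], 'C'], 'D']
((([VAR] LIKE "%A%" AND [VAR] LIKE "%B%") AND [VAR] LIKE "%C%") AND [VAR] LIKE "%D%")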