83 lines
2.4 KiB
Python
83 lines
2.4 KiB
Python
#
# Copyright (C) 2009-2020 the sqlparse authors and contributors
# <see AUTHORS file>
#
# This module is part of python-sqlparse and is released under
# the BSD License: https://opensource.org/licenses/BSD-3-Clause
|
|
"""SQL Lexer"""
|
|
|
|
# This code is based on the SqlLexer in pygments.
# http://pygments.org/
# It's separated from the rest of pygments to increase performance
# and to allow some customizations.
|
|
|
|
from io import TextIOBase
|
|
|
|
from sqlparse import tokens
|
|
from sqlparse.keywords import SQL_REGEX
|
|
from sqlparse.utils import consume
|
|
|
|
|
|
class Lexer:
    """Lexer

    Carries no state of its own; it only hosts the static
    :meth:`get_tokens`. Kept as a class for backwards-compatibility.
    """

    @staticmethod
    def get_tokens(text, encoding=None):
        """Split ``text`` into a stream of ``(tokentype, value)`` pairs.

        This is a generator: nothing (including input validation) runs
        until the result is iterated.

        :param text: the SQL to tokenize. May be a ``str``, a ``bytes``
            object, or a text file-like object (an ``io.TextIOBase``
            instance), which is read completely via ``read()``.
        :param encoding: codec used to decode ``bytes`` input. When it is
            falsy, UTF-8 is tried first and ``unicode-escape`` is used as
            a fallback if that fails.
        :returns: an iterator of ``(tokentype, value)`` pairs. Text that
            no entry of ``SQL_REGEX`` matches is emitted one character at
            a time as ``(tokens.Error, char)``.
        :raises TypeError: if ``text`` is neither ``str``, ``bytes`` nor a
            text file-like object (raised on first iteration).
        """
        if isinstance(text, TextIOBase):
            text = text.read()

        # Normalize the input to ``str``; ``str`` input passes through as is.
        if isinstance(text, bytes):
            if encoding:
                text = text.decode(encoding)
            else:
                try:
                    text = text.decode('utf-8')
                except UnicodeDecodeError:
                    # Fallback for bytes that are not valid UTF-8.
                    text = text.decode('unicode-escape')
        elif not isinstance(text, str):
            raise TypeError("Expected text or file-like object, got {!r}".
                            format(type(text)))

        # ``enumerate`` supplies the current offset; the iterator is kept
        # in a variable so that matched characters can be skipped below.
        iterable = enumerate(text)
        for pos, char in iterable:
            # Try the matchers in order; the first one that matches at
            # ``pos`` wins.
            for rexmatch, action in SQL_REGEX:
                m = rexmatch(text, pos)

                if not m:
                    continue
                elif isinstance(action, tokens._TokenType):
                    yield action, m.group()
                elif callable(action):
                    # Presumably the callable returns the (tokentype, value)
                    # pair itself -- confirm against sqlparse.keywords.
                    yield action(m.group())

                # Skip the rest of the matched text; the outer loop has
                # already consumed its first character.
                # NOTE(review): assumes no pattern produces a zero-width
                # match, which would pass -1 here -- confirm how
                # ``consume`` treats a negative count.
                consume(iterable, m.end() - pos - 1)
                break
            else:
                # No matcher applied at this position.
                yield tokens.Error, char
|
|
|
|
|
|
def tokenize(sql, encoding=None):
    """Run the lexer over *sql*.

    Convenience wrapper that creates a :class:`Lexer` and hands both
    arguments to its ``get_tokens`` method.

    :param sql: the input to tokenize, passed through unchanged.
    :param encoding: passed through unchanged as the second argument.
    :returns: the stream of ``(token type, value)`` 2-tuples produced by
        ``Lexer.get_tokens``.
    """
    lexer = Lexer()
    token_stream = lexer.get_tokens(sql, encoding)
    return token_stream
|