SQL解析系列(Python)--sqlparse源码

徐可甲

1、简介

sqlparse是一个无验证的SQL解析器。它提供了解析、拆分、格式化SQL语句的能力。

代码： https:// github.com/andialbrecht /sqlparse

sqlparse提供了三个基本的函数，用于SQL语句处理。

split

拆分包含多个SQL语句的字符串为SQL语句列表。
语句结尾通过 ; 分隔。

>>> import sqlparse
>>> sql = 'select * from foo; select * from bar;'
>>> sqlparse.split(sql)
[u'select * from foo; ', u'select * from bar;']

format

将SQL语句格式化，以便更清晰的展现。

>>> sql = 'select * from foo where id in (select id from bar);'
>>> print(sqlparse.format(sql, reindent=True, keyword_case='upper'))
SELECT *
FROM foo
WHERE id IN
  (SELECT id
   FROM bar);

parse

parse()返回sql解析结果tuple。
tuple的每个元素对应于split()的一个SQL语句。
解析的结果也可以通过str()生成原始的SQL语句。

>>> parsed = sqlparse.parse(sql)
>>> parsed
(<Statement 'select...' at 0x9ad08ec>,)
>>> stmt = parsed[0]  # grab the Statement object
>>> stmt.tokens
(<DML 'select' at 0x9b63c34>,
 <Whitespace ' ' at 0x9b63e8c>,
 <Operator '*' at 0x9b63e64>,
 <Whitespace ' ' at 0x9b63c5c>,
 <Keyword 'from' at 0x9b63c84>,
 <Whitespace ' ' at 0x9b63cd4>,
 <Identifier '"somes...' at 0x9b5c62c>,
 <Whitespace ' ' at 0x9b63f04>,
 <Where 'where ...' at 0x9b5caac>)
>>> str(stmt)  # str(stmt) for Python 3
'select * from "someschema"."mytable" where id = 1'
>>> str(stmt.tokens[-1])  # or just the WHERE part
'where id = 1'

2、源码解析

2.1 代码结构

sqlparse
├── engine                        # 解析引擎
│   ├── __init__.py 
│   ├── filter_stack.py
│   ├── grouping.py               # SQL语法解析
│   └── statement_splitter.py     # SQL语句切分
├── filters                       # format的过滤项
│   ├── __init__.py
│   ├── aligned_indent.py
│   ├── others.py
│   ├── output.py
│   ├── reindent.py
│   ├── right_margin.py
│   └── tokens.py
├── __init__.py                   # split/parse/format入口
├── __main__.py
├── exceptions.py
├── cli.py
├── formatter.py                  # format参数
├── keywords.py                   # 关键词匹配规则
├── lexer.py                      # 词法解析器
├── sql.py                        # sql语法定义
├── tokens.py                     # token定义
└── utils.py

2.2 词法解析(tokenize)

关键词匹配规则：keywords.py

左侧为匹配正则表达式，右侧为类型或者是函数is_keyword。
测试用例： https:// github.com/messixukejia /sqlparse/blob/master/tests/test_keywords.py

def is_keyword(value):
    val = value.upper()
    return (KEYWORDS_COMMON.get(val)
            or KEYWORDS_ORACLE




    
.get(val)
            or KEYWORDS_PLPGSQL.get(val)
            or KEYWORDS_HQL.get(val)
            or KEYWORDS.get(val, tokens.Name)), value
SQL_REGEX = {
    'root': [
        (r'CREATE(\s+OR\s+REPLACE)?\b', tokens.Keyword.DDL),
        (r'GROUP\s+BY\b', tokens.Keyword),
        (r'[0-9_A-ZÀ-Ü][_$#\w]*', is_keyword),
        (r'[;:()\[\],\.]', tokens.Punctuation),
SQL_REGEX = [(re.compile(rx, FLAGS).match, tt) for rx, tt in SQL_REGEX['root']]

词法解析器：lexer.py

Lexer::get_tokens 对于原始输入的sql语句，逐字符匹配SQL_REGEX左半部分的正则表达式，识别出token_list。
测试用例： https:// github.com/messixukejia /sqlparse/blob/master/tests/test_tokenize.py

样例：

# 输入
sql: select * from foo;
(Token.Keyword.DML, 'select')
(Token.Text.Whitespace, ' ')
(Token.Wildcard, '*')
(Token.Text.Whitespace, ' ')
(Token.Keyword, 'from')
(Token.Text.Whitespace, ' ')
(Token.Name, 'foo')
(Token.Punctuation, ';')

2.3 语句拆分(sqlparse.split)

语句切分程序：filter_stack.py -> statement_splitter.py

class StatementSplitter:
    # stream: 词法解析器的generator object Lexer.get_tokens
        def process(self, stream):
        """Process the stream"""
        EOS_TTYPE = T.Whitespace, T.Comment.Single
        # Run over all stream tokens
        for ttype, value in stream:
            # Yield token if we finished a statement and there's no whitespaces
            # It will count newline token as a non whitespace. In this context
            # whitespace ignores newlines.
            # why don't multi line comments also count?
            # 新的一行。输出sql.Statement
            if self.consume_ws and ttype not in EOS_TTYPE:
                yield sql.Statement(self.tokens)
                # Reset filter and prepare to process next statement
                self._reset()
            # Change current split level (increase, decrease or remain equal)
            # level标示是否有语句嵌套，例如()、BEGIN、END等。
            self.level += self._change_splitlevel(ttype, value)
            # Append the token to the current statement
            self.tokens.append(sql.Token(ttype, value))
            # Check if we get the end of a statement
            # self.level<=0说明没有嵌套语句，标点符号；标示语句切分
            if self.level <= 0 and ttype is T.Punctuation and value == ';':
                self.consume_ws = True
        # Yield pending statement (if any)
        if self.tokens and not all(t.is_whitespace for t in self.tokens):
            yield sql.Statement(self.tokens)

测试用例： https:// github.com/messixukejia /sqlparse/blob/master/tests/test_split.py
样例：

# 输入
sql: select * from foo; select * from bar;
# 中间结果，存入self.tokens(词法解析结果)
Token.Keyword.DML select
Token.Text.Whitespace  
Token.Wildcard *
Token.Text.Whitespace  
Token.Keyword from
Token.Text.Whitespace  
Token.Name foo
Token.Punctuation ;
Token.Text.Whitespace  
Token.Keyword.DML select
Token.Text.Whitespace  
Token.Wildcard *
Token.Text.Whitespace  
Token.Keyword from
Token.Text.Whitespace  
Token.Name bar
Token.Punctuation ;
# 通过;切分后，sql.Statement(self.tokens)聚合，输出类型Statement。
# 最终输出：Statement类型转换为string
select * from foo; 
select * from bar;

2.4 语法解析(sqlparse.parse)

语法解析器：filter_stack.py -> statement_splitter.py grouping.py
group代码：grouping.py。定义了各种SQL语法的解析规则，每种规则一般分为两部：

识别语法相关语句，依赖token_next_by。
调整语法树结构，依赖group_tokens。

# group语法规则
def group(stmt):
    for func in [
        group_comments,
        # _group_matching
        group_brackets,
        group_parenthesis,
        group_case,
        group_if,
        group_for,
        group_begin,
        group_functions,
        group_where,
        group_period,
        group_arrays,
        group_identifier,
        group_order,
        group_typecasts,
        group_tzcasts,
        group_typed_literal,
        group_operator,
        group_comparison,
        group_as,
        group_aliased,
        group_assignment,
        align_comments,
        group_identifier_list,
        group_values,
        func(stmt)
    return stmt

语法解析token表示方式

# 类型




    

## :param token:
## :param i: Class or Tuple/List of Classes. 对应sql.py
## :param m: Tuple of TokenType & Value. Can be list of Tuple for multiple. 对应token.py + value
## :param t: TokenType or Tuple/List of TokenTypes. 对应tokens.py
# sql.py
## 基本类型NameAliasMixin/Token/TokenList。其他类型都是基于他们定义。
class Statement(TokenList):
    """Represents a SQL statement."""        
class Identifier(NameAliasMixin, TokenList):
    """Represents an identifier. Identifiers may have aliases or typecasts."""
class IdentifierList(TokenList):
    """A list of :class:`~sqlparse.sql.Identifier`\'s."""
class Where(TokenList):
    """A WHERE clause."""
class Comparison(TokenList):
    """A comparison used for example in WHERE clauses."""
class Having(TokenList):
    """A HAVING clause."""

group样例可以看出group的作用。本样例使用group_parenthesis对()进行了group。

# 输入
sql: select (select (x3) x2) and (y2) bar
# StatementSplitter().process输出结果
|- 0 DML 'select'
|- 1 Whitespace ' '
|- 2 Punctuation '('
|- 3 DML 'select'
|- 4 Whitespace ' '
|- 5 Punctuation '('
|- 6 Name 'x3'
|- 7 Punctuation ')'
|- 8 Whitespace ' '
|- 9 Name 'x2'
|- 10 Punctuation ')'
|- 11 Whitespace ' '
|- 12 Keyword 'and'
|- 13 Whitespace ' '
|- 14 Punctuation '('
|- 15 Name 'y2'
|- 16 Punctuation ')'
|- 17 Whitespace ' '
`- 18 Name 'bar'
# group之后的结果，形成了语法树层次。
|- 0 DML 'select'
|- 1 Whitespace ' '
|- 2 Parenthesis '(selec...'  ### sql.Parenthesis
|  |- 0 Punctuation '('       ### token
|  |- 1 DML 'select'
|  |- 2 Whitespace ' '
|  |- 3 Identifier '(x3) x2'  ### sql.Identifier
|  |  |- 0 Parenthesis '(x3)' ### sql.Parenthesis
|  |  |  |- 0 Punctuation '(' ### T.Punctuation
|  |  |  |- 1 Identifier 'x3'
|  |  |  |  `- 0 Name 'x3'
|  |  |  `- 2 Punctuation ')'
|  |  |- 1 Whitespace ' '
|  |  `- 2 Identifier 'x2'
|  |     `- 0 Name 'x2'
|  `- 4 Punctuation ')'
|- 3 Whitespace ' '
|- 4 Keyword 'and'
|- 5 Whitespace ' '
`- 6 Identifier '(y2) b...'
   |- 0 Parenthesis '(y2)'
   |  |- 0 Punctuation '('
   |  |- 1 Identifier 'y2'
   |  |  `- 0 Name 'y2'
   |  `- 2 Punctuation ')'
   |- 1 Whitespace ' '
   `- 2 Identifier 'bar'
      `- 0 Name 'bar'

parse样例：

# 输入
sql: select a0, b0, c0, d0, e0 from (select * from dual) q0 where 1=1 and 2=2
"|- 0 DML 'select'",
"|- 1 Whitespace ' '",
"|- 2 IdentifierList 'a0, b0...'",
"|  |- 0 Identifier 'a0'",
"|  |  `- 0 Name 'a0'",
"|  |- 1 Punctuation ','",
"|  |- 2 Whitespace ' '",
"|  |- 3 Identifier 'b0'",
"|  |  `- 0 Name 'b0'",
"|  |- 4 Punctuation ','",
"|  |- 5 Whitespace ' '",
"|  |- 6 Identifier 'c0'",
"|  |  `- 0 Name 'c0'",
"|  |- 7 Punctuation ','",
"|  |- 8 Whitespace ' '",
"|  |- 9 Identifier 'd0'",
"|  |  `- 0 Name 'd0'",
"|  |- 10 Punctuation ','",
"|  |- 11 Whitespace ' '",
"|  `- 12 Float 'e0'",
"|- 3 Whitespace ' '",
"|- 4 Keyword 'from'",
"|- 5 Whitespace ' '",
"|- 6 Identifier '(selec...'",
"|  |- 0 Parenthesis '(selec...'",
"|  |  |- 0 Punctuation '('",
"|  |  |- 1 DML 'select'",
"|  |  |- 2 Whitespace ' '",
"|  |  |- 3 Wildcard '*'",
"|  |  |- 4 Whitespace ' '",
"|  |  |- 5 Keyword 'from'",
"|  |  |- 6 Whitespace ' '",
"|  |  |- 7 Identifier 'dual'",
"|  |  |  `- 0 Name 'dual'",
"|  |  `- 8 Punctuation ')'",
"|  |- 1 Whitespace ' '",
"|  `- 2 Identifier 'q0'",
"|     `- 0 Name 'q0'",
"|- 7 Whitespace ' '",
"`- 8 Where 'where ...'",
"   |- 0 Keyword 'where'",
"   |- 1 Whitespace ' '",
"   |- 2 Comparison '1=1'",
"   |  |- 0 Integer '1'",
"   |  |- 1 Comparison '='",
"   |  `- 2 Integer '1'",
"   |- 3 Whitespace ' '",
"   |- 4 Keyword 'and'",
"   |- 5 Whitespace ' '",
"   `- 6 Comparison '2=2'",
"      |- 0 Integer '2'",
"      |- 1 Comparison '='",
"      `- 2 Integer '2'",
""

测试用例： https:// github.com/messixukejia /sqlparse/blob/master/tests/test_grouping.py
测试代码： https:// github.com/messixukejia /sqlparse/blob/master/tests/test_parse.py

2.5 SQL格式化(sqlparse.format)

格式化

# 根据options参数，设定self.preprocess self.stmtprocess self.postprocess回调
def build_filter_stack(stack, options)
# StatementSplitter().process前后进行filter处理
class FilterStack:
    def run(self, sql, encoding=None):
        stream = lexer.tokenize(sql, encoding)
        # Process token stream
        for filter_ in self.preprocess:
            stream = filter_.process(stream)
        stream = StatementSplitter().process(stream)
        # Output: Stream processed Statements
        for stmt in stream:
            if self._grouping:
                stmt = grouping.group(stmt)
            for filter_ in self.stmtprocess:
                filter_.process(stmt)
            for filter_ in self.postprocess:
                stmt = filter_.process(stmt)
            yield stmt

使用reindent方式格式化后的样例：

# 输入
SQL: select * from foo, bar where bar.id = foo.bar_id having sum(bar.value) > 100