SQL解析系列(Python)--sqlparse源码

1、简介

sqlparse是一个无验证的SQL解析器。它提供了解析、拆分、格式化SQL语句的能力。

代码: github.com/andialbrecht

sqlparse提供了三个基本的函数,用于SQL语句处理。

  • split
    • 拆分包含多个SQL语句的字符串为SQL语句列表。
    • 语句结尾通过 ; 分隔。
>>> import sqlparse
>>> sql = 'select * from foo; select * from bar;'
>>> sqlparse.split(sql)
[u'select * from foo; ', u'select * from bar;']
  • format
    • 将SQL语句格式化,以便更清晰的展现。
>>> sql = 'select * from foo where id in (select id from bar);'
>>> print(sqlparse.format(sql, reindent=True, keyword_case='upper'))
SELECT *
FROM foo
WHERE id IN
  (SELECT id
   FROM bar);
  • parse
    • parse()返回sql解析结果tuple。
    • tuple的每个元素对应于split()的一个SQL语句。
    • 解析的结果也可以通过str()生成原始的SQL语句。
>>> parsed = sqlparse.parse(sql)
>>> parsed
(<Statement 'select...' at 0x9ad08ec>,)
>>> stmt = parsed[0]  # grab the Statement object
>>> stmt.tokens
(<DML 'select' at 0x9b63c34>,
 <Whitespace ' ' at 0x9b63e8c>,
 <Operator '*' at 0x9b63e64>,
 <Whitespace ' ' at 0x9b63c5c>,
 <Keyword 'from' at 0x9b63c84>,
 <Whitespace ' ' at 0x9b63cd4>,
 <Identifier '"somes...' at 0x9b5c62c>,
 <Whitespace ' ' at 0x9b63f04>,
 <Where 'where ...' at 0x9b5caac>)
>>> str(stmt)  # str(stmt) for Python 3
'select * from "someschema"."mytable" where id = 1'
>>> str(stmt.tokens[-1])  # or just the WHERE part
'where id = 1'

2、源码解析

2.1 代码结构

sqlparse
├── engine                        # 解析引擎
│   ├── __init__.py 
│   ├── filter_stack.py
│   ├── grouping.py               # SQL语法解析
│   └── statement_splitter.py     # SQL语句切分
├── filters                       # format的过滤项
│   ├── __init__.py
│   ├── aligned_indent.py
│   ├── others.py
│   ├── output.py
│   ├── reindent.py
│   ├── right_margin.py
│   └── tokens.py
├── __init__.py                   # split/parse/format入口
├── __main__.py
├── exceptions.py
├── cli.py
├── formatter.py                  # format参数
├── keywords.py                   # 关键词匹配规则
├── lexer.py                      # 词法解析器
├── sql.py                        # sql语法定义
├── tokens.py                     # token定义
└── utils.py

2.2 词法解析(tokenize)

def is_keyword(value):
    val = value.upper()
    return (KEYWORDS_COMMON.get(val)
            or KEYWORDS_ORACLE.get(val)
            or KEYWORDS_PLPGSQL.get(val)
            or KEYWORDS_HQL.get(val)
            or KEYWORDS.get(val, tokens.Name)), value
SQL_REGEX = {
    'root': [
        (r'CREATE(\s+OR\s+REPLACE)?\b', tokens.Keyword.DDL),
        (r'GROUP\s+BY\b', tokens.Keyword),
        (r'[0-9_A-ZÀ-Ü][_$#\w]*', is_keyword),
        (r'[;:()\[\],\.]', tokens.Punctuation),
SQL_REGEX = [(re.compile(rx, FLAGS).match, tt) for rx, tt in SQL_REGEX['root']]
# 输入
sql: select * from foo;
(Token.Keyword.DML, 'select')
(Token.Text.Whitespace, ' ')
(Token.Wildcard, '*')
(Token.Text.Whitespace, ' ')
(Token.Keyword, 'from')
(Token.Text.Whitespace, ' ')
(Token.Name, 'foo')
(Token.Punctuation, ';')

2.3 语句拆分(sqlparse.split)

  • 语句切分程序:filter_stack.py -> statement_splitter.py
class StatementSplitter:
    # stream: 词法解析器的generator object Lexer.get_tokens
        def process(self, stream):
        """Process the stream"""
        EOS_TTYPE = T.Whitespace, T.Comment.Single
        # Run over all stream tokens
        for ttype, value in stream:
            # Yield token if we finished a statement and there's no whitespaces
            # It will count newline token as a non whitespace. In this context
            # whitespace ignores newlines.
            # why don't multi line comments also count?
            # 新的一行。输出sql.Statement
            if self.consume_ws and ttype not in EOS_TTYPE:
                yield sql.Statement(self.tokens)
                # Reset filter and prepare to process next statement
                self._reset()
            # Change current split level (increase, decrease or remain equal)
            # level标示是否有语句嵌套,例如()、BEGIN、END等。
            self.level += self._change_splitlevel(ttype, value)
            # Append the token to the current statement
            self.tokens.append(sql.Token(ttype, value))
            # Check if we get the end of a statement
            # self.level<=0说明没有嵌套语句,标点符号;标示语句切分
            if self.level <= 0 and ttype is T.Punctuation and value == ';':
                self.consume_ws = True
        # Yield pending statement (if any)
        if self.tokens and not all(t.is_whitespace for t in self.tokens):
            yield sql.Statement(self.tokens)
# 输入
sql: select * from foo; select * from bar;
# 中间结果,存入self.tokens(词法解析结果)
Token.Keyword.DML select
Token.Text.Whitespace  
Token.Wildcard *
Token.Text.Whitespace  
Token.Keyword from
Token.Text.Whitespace  
Token.Name foo
Token.Punctuation ;
Token.Text.Whitespace  
Token.Keyword.DML select
Token.Text.Whitespace  
Token.Wildcard *
Token.Text.Whitespace  
Token.Keyword from
Token.Text.Whitespace  
Token.Name bar
Token.Punctuation ;
# 通过;切分后,sql.Statement(self.tokens)聚合,输出类型Statement。
# 最终输出:Statement类型转换为string
select * from foo; 
select * from bar;

2.4 语法解析(sqlparse.parse)

  • 语法解析器:filter_stack.py -> statement_splitter.py grouping.py
  • group代码:grouping.py。定义了各种SQL语法的解析规则,每种规则一般分为两部:
    • 识别语法相关语句,依赖token_next_by。
    • 调整语法树结构,依赖group_tokens。
# group语法规则
def group(stmt):
    for func in [
        group_comments,
        # _group_matching
        group_brackets,
        group_parenthesis,
        group_case,
        group_if,
        group_for,
        group_begin,
        group_functions,
        group_where,
        group_period,
        group_arrays,
        group_identifier,
        group_order,
        group_typecasts,
        group_tzcasts,
        group_typed_literal,
        group_operator,
        group_comparison,
        group_as,
        group_aliased,
        group_assignment,
        align_comments,
        group_identifier_list,
        group_values,
        func(stmt)
    return stmt
  • 语法解析token表示方式
# 类型
## :param token:
## :param i: Class or Tuple/List of Classes. 对应sql.py
## :param m: Tuple of TokenType & Value. Can be list of Tuple for multiple. 对应token.py + value
## :param t: TokenType or Tuple/List of TokenTypes. 对应tokens.py
# sql.py
## 基本类型NameAliasMixin/Token/TokenList。其他类型都是基于他们定义。
class Statement(TokenList):
    """Represents a SQL statement."""        
class Identifier(NameAliasMixin, TokenList):
    """Represents an identifier. Identifiers may have aliases or typecasts."""
class IdentifierList(TokenList):
    """A list of :class:`~sqlparse.sql.Identifier`\'s."""
class Where(TokenList):
    """A WHERE clause."""
class Comparison(TokenList):
    """A comparison used for example in WHERE clauses."""
class Having(TokenList):
    """A HAVING clause."""
  • group样例可以看出group的作用。本样例使用group_parenthesis对()进行了group。
# 输入
sql: select (select (x3) x2) and (y2) bar
# StatementSplitter().process输出结果
|- 0 DML 'select'
|- 1 Whitespace ' '
|- 2 Punctuation '('
|- 3 DML 'select'
|- 4 Whitespace ' '
|- 5 Punctuation '('
|- 6 Name 'x3'
|- 7 Punctuation ')'
|- 8 Whitespace ' '
|- 9 Name 'x2'
|- 10 Punctuation ')'
|- 11 Whitespace ' '
|- 12 Keyword 'and'
|- 13 Whitespace ' '
|- 14 Punctuation '('
|- 15 Name 'y2'
|- 16 Punctuation ')'
|- 17 Whitespace ' '
`- 18 Name 'bar'
# group之后的结果,形成了语法树层次。
|- 0 DML 'select'
|- 1 Whitespace ' '
|- 2 Parenthesis '(selec...'  ### sql.Parenthesis
|  |- 0 Punctuation '('       ### token
|  |- 1 DML 'select'
|  |- 2 Whitespace ' '
|  |- 3 Identifier '(x3) x2'  ### sql.Identifier
|  |  |- 0 Parenthesis '(x3)' ### sql.Parenthesis
|  |  |  |- 0 Punctuation '(' ### T.Punctuation
|  |  |  |- 1 Identifier 'x3'
|  |  |  |  `- 0 Name 'x3'
|  |  |  `- 2 Punctuation ')'
|  |  |- 1 Whitespace ' '
|  |  `- 2 Identifier 'x2'
|  |     `- 0 Name 'x2'
|  `- 4 Punctuation ')'
|- 3 Whitespace ' '
|- 4 Keyword 'and'
|- 5 Whitespace ' '
`- 6 Identifier '(y2) b...'
   |- 0 Parenthesis '(y2)'
   |  |- 0 Punctuation '('
   |  |- 1 Identifier 'y2'
   |  |  `- 0 Name 'y2'
   |  `- 2 Punctuation ')'
   |- 1 Whitespace ' '
   `- 2 Identifier 'bar'
      `- 0 Name 'bar'
  • parse样例:
# 输入
sql: select a0, b0, c0, d0, e0 from (select * from dual) q0 where 1=1 and 2=2
"|- 0 DML 'select'",
"|- 1 Whitespace ' '",
"|- 2 IdentifierList 'a0, b0...'",
"|  |- 0 Identifier 'a0'",
"|  |  `- 0 Name 'a0'",
"|  |- 1 Punctuation ','",
"|  |- 2 Whitespace ' '",
"|  |- 3 Identifier 'b0'",
"|  |  `- 0 Name 'b0'",
"|  |- 4 Punctuation ','",
"|  |- 5 Whitespace ' '",
"|  |- 6 Identifier 'c0'",
"|  |  `- 0 Name 'c0'",
"|  |- 7 Punctuation ','",
"|  |- 8 Whitespace ' '",
"|  |- 9 Identifier 'd0'",
"|  |  `- 0 Name 'd0'",
"|  |- 10 Punctuation ','",
"|  |- 11 Whitespace ' '",
"|  `- 12 Float 'e0'",
"|- 3 Whitespace ' '",
"|- 4 Keyword 'from'",
"|- 5 Whitespace ' '",
"|- 6 Identifier '(selec...'",
"|  |- 0 Parenthesis '(selec...'",
"|  |  |- 0 Punctuation '('",
"|  |  |- 1 DML 'select'",
"|  |  |- 2 Whitespace ' '",
"|  |  |- 3 Wildcard '*'",
"|  |  |- 4 Whitespace ' '",
"|  |  |- 5 Keyword 'from'",
"|  |  |- 6 Whitespace ' '",
"|  |  |- 7 Identifier 'dual'",
"|  |  |  `- 0 Name 'dual'",
"|  |  `- 8 Punctuation ')'",
"|  |- 1 Whitespace ' '",
"|  `- 2 Identifier 'q0'",
"|     `- 0 Name 'q0'",
"|- 7 Whitespace ' '",
"`- 8 Where 'where ...'",
"   |- 0 Keyword 'where'",
"   |- 1 Whitespace ' '",
"   |- 2 Comparison '1=1'",
"   |  |- 0 Integer '1'",
"   |  |- 1 Comparison '='",
"   |  `- 2 Integer '1'",
"   |- 3 Whitespace ' '",
"   |- 4 Keyword 'and'",
"   |- 5 Whitespace ' '",
"   `- 6 Comparison '2=2'",
"      |- 0 Integer '2'",
"      |- 1 Comparison '='",
"      `- 2 Integer '2'",
""

2.5 SQL格式化(sqlparse.format)

  • 格式化
# 根据options参数,设定self.preprocess self.stmtprocess self.postprocess回调
def build_filter_stack(stack, options)
# StatementSplitter().process前后进行filter处理
class FilterStack:
    def run(self, sql, encoding=None):
        stream = lexer.tokenize(sql, encoding)
        # Process token stream
        for filter_ in self.preprocess:
            stream = filter_.process(stream)
        stream = StatementSplitter().process(stream)
        # Output: Stream processed Statements
        for stmt in stream:
            if self._grouping:
                stmt = grouping.group(stmt)
            for filter_ in self.stmtprocess:
                filter_.process(stmt)
            for filter_ in self.postprocess:
                stmt = filter_.process(stmt)
            yield stmt
  • 使用reindent方式格式化后的样例:
# 输入
SQL: select * from foo, bar where bar.id = foo.bar_id having sum(bar.value) > 100