#!/usr/bin/python
# -*- coding: utf-8 -*-
"""expose_mb.py v1.0 written by fuku@rouge.gr.jp
SQL から多バイト文字を使用している識別子を抽出する

使い方:
    ./expose_mb.py << EOF
    -- comment
    SELECT 名前 FROM 表;
    /* comment */
    EOF
結果:
    2: 名前,表
"""
import sqlparse

def expose_mb(sqlText: str) -> list:
    """Extract identifiers that contain non-ASCII characters from SQL text.

    The text is parsed with ``sqlparse`` and the leaf tokens of every
    statement are walked in source order while a running character offset
    and line number are maintained.  The line reported for an identifier is
    therefore the line of the token itself; an identical piece of text that
    appears earlier inside a comment or a string literal cannot be mistaken
    for it (which a plain text search from the previous match would do).

    Args:
        sqlText: SQL source text.  It may contain several statements as
            well as comments.

    Returns:
        A list of ``(line_number, identifiers)`` tuples sorted by line
        number.  Line numbers are 1-based and count ``'\\n'`` characters.
        ``identifiers`` lists the distinct matching identifiers of that
        line in order of first appearance.  The list is empty when no
        identifier contains a non-ASCII character.

    Raises:
        RuntimeError: If the text at the tracked offset is not the token
            being examined, i.e. the token stream does not line up with
            ``sqlText``.
    """
    exposed = {}  # line number -> identifiers found on that line

    # Position of the token currently being examined.  This assumes the
    # flattened token values, concatenated, reproduce sqlText verbatim;
    # the startswith() check below guards that assumption.
    offset = 0
    lineNo = 1

    for stmt in sqlparse.parse(sqlText):

        for token in stmt.flatten():

            word = token.value

            # Only plain identifiers (exact Name token type) that contain
            # at least one character outside the ASCII range are reported.
            if (token.ttype == sqlparse.tokens.Name
                    and any(ord(c) > 0x7F for c in word)):

                if not sqlText.startswith(word, offset):
                    # Not expected to happen as long as the parser keeps
                    # the source text intact.
                    raise RuntimeError(f'offset {offset} is not found.')

                # Group identifiers by line, keeping each one only once.
                names = exposed.setdefault(lineNo, [])
                if word not in names:
                    names.append(word)

            # Advance past this token, whatever its type, so that comments,
            # literals and whitespace are accounted for as well.
            offset += len(word)
            lineNo += word.count('\n')

    # Line numbers are unique keys, so sorting the items orders by line.
    return sorted(exposed.items())

if __name__ == '__main__':

    import sys

    # Read the complete SQL text from standard input and scan it.
    findings = expose_mb(sys.stdin.read())

    if findings:
        # One output line per source line: "<line>: <ident>,<ident>,..."
        for line_no, names in findings:
            print(f"{line_no}: {','.join(names)}")
    else:
        # Nothing found: print the fixed "no multi-byte identifiers" notice.
        print('多バイト文字の識別子が見つかりません.')