# https://github.com/huggingface/transformers/blob/master/src/transformers/tokenization_utils.py
# tokenizer
# http://www.unicode.org/reports/tr44/#GC_Values_Table
import unicodedata
unicodedata.unidata_version
'12.1.0'
# Enumeration of Unicode general categories
# https://www.fileformat.info/info/unicode/category/index.htm
print(unicodedata.category('.'))
print(unicodedata.category('-'))
print(unicodedata.category(','))
print(unicodedata.category(' '))
#unicodedata.category('a๎๎๎๎๎๎๎๎๎๎๎๎๎๎๎๎๎๎๎')
for ch in 'a๎๎๎๎๎๎๎๎๎๎๎๎๎๎๎๎๎๎๎':
    print(unicodedata.category(ch))
Po
Pd
Po
Zs
Ll
Mn
Mn
Mn
Mn
Mn
Mn
Mn
Mn
Mn
Mn
Mn
Mn
Mn
Mn
Mn
Mn
Mn
Mn
Mn
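# Quick check: a combining mark such as U+0301 COMBINING ACUTE ACCENT is also
# category Mn; NFD decomposition (used further below) is what exposes these marks
# as separate code points
for c in unicodedata.normalize('NFD', '\u00e9'):  # 'é' -> 'e' + U+0301
    print(hex(ord(c)), unicodedata.category(c), unicodedata.name(c))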
# http://c.biancheng.net/c/ascii/
# ord() returns the decimal code point
print(ord(' '))
print(ord('\n'))
print(ord('\r'))
print(ord('\t'))
32
10
13
9
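# chr() is the inverse of ord(); these four code points are exactly the ASCII
# whitespace characters handled explicitly in is_space() below
print([chr(c) for c in (32, 10, 13, 9)])  # [' ', '\n', '\r', '\t']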
# Whitespace detection
def is_space(ch):
    """Return True for whitespace-like characters.
    Covers the ASCII space, tab, carriage return and newline, plus Unicode category Zs.
    """
    return ch == ' ' or \
        ch == '\t' or \
        ch == '\r' or \
        ch == '\n' or \
        unicodedata.category(ch) == 'Zs'  # [Zs] Separator, Space
print(is_space(' '))
print(is_space('\n'))
print(is_space('\r'))
print(is_space('\t'))
print(is_space('A'))
True
True
True
True
False
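# is_space also catches non-ASCII spaces via the Zs category, e.g. the no-break
# space U+00A0
print(unicodedata.category('\u00a0'))  # Zs
print(is_space('\u00a0'))              # True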
# Punctuation detection
def is_punctuation(ch):
    """Return True for punctuation characters (both half- and full-width).
    English punctuation is half-width; Chinese punctuation is full-width.
    ASCII punctuation ranges:
        [33, 47]   ! " # $ % & ' ( ) * + , - . /
        [58, 64]   : ; < = > ? @
        [91, 96]   [ \ ] ^ _ `
        [123, 126] { | } ~
    unicodedata.category(ch).startswith('P') covers all Unicode punctuation
    categories: [Pc][Pd][Pe][Pf][Pi][Po][Ps]
    # https://www.fileformat.info/info/unicode/category/index.htm
    """
    code = ord(ch)
    return 33 <= code <= 47 or \
        58 <= code <= 64 or \
        91 <= code <= 96 or \
        123 <= code <= 126 or \
        unicodedata.category(ch).startswith('P')
is_punctuation('？')  # full-width Chinese question mark
unicodedata.category('∫')
'Sm'
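# Quick check: the full-width '？' has category Po, so is_punctuation returns True;
# '∫' is a math symbol (Sm) outside the ASCII punctuation ranges, so it returns False
print(is_punctuation('？'))  # True  (category Po)
print(is_punctuation('.'))   # True  (ASCII range [33, 47])
print(is_punctuation('∫'))   # False (category Sm)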
def is_control(ch):
    """Return True for control characters (Unicode categories Cc and Cf).
    https://en.wikipedia.org/wiki/Control_character
    https://www.fileformat.info/info/unicode/category/Cc/index.htm
    https://www.fileformat.info/info/unicode/category/Cf/index.htm
    """
    return unicodedata.category(ch) in ('Cc', 'Cf')
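# Note: '\n', '\r' and '\t' all have category Cc, so is_control reports them as
# control characters; the spacing loop below checks is_space first, so they still
# become spaces instead of being dropped
print(unicodedata.category('\n'), is_control('\n'))  # Cc True
print(is_control('a'))                               # False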
def is_cjk_character(ch):
    """Return True for CJK characters (Chinese characters included).
    Reference: https://en.wikipedia.org/wiki/Unicode_block
    # 0x4E00-0x9FFF    CJK Unified Ideographs
    # 0x3400-0x4DBF    CJK Unified Ideographs Extension A
    # 0x20000-0x2A6DF  CJK Unified Ideographs Extension B
    # 0x2A700-0x2B73F  CJK Unified Ideographs Extension C
    # 0x2B740-0x2B81F  CJK Unified Ideographs Extension D
    # 0x2B820-0x2CEAF  CJK Unified Ideographs Extension E
    # 0xF900-0xFAFF    CJK Compatibility Ideographs
    # 0x2F800-0x2FA1F  CJK Compatibility Ideographs Supplement
    """
    code = ord(ch)
    return 0x4E00 <= code <= 0x9FFF or \
        0x3400 <= code <= 0x4DBF or \
        0x20000 <= code <= 0x2A6DF or \
        0x2A700 <= code <= 0x2B73F or \
        0x2B740 <= code <= 0x2B81F or \
        0x2B820 <= code <= 0x2CEAF or \
        0xF900 <= code <= 0xFAFF or \
        0x2F800 <= code <= 0x2FA1F
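# Quick check: Han characters fall in the CJK Unified Ideographs block, while Latin
# letters and full-width punctuation do not
print(is_cjk_character('中'))  # True  (U+4E2D)
print(is_cjk_character('a'))   # False
print(is_cjk_character('？'))  # False (full-width punctuation is not in these blocks)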
"""分词器
"""
origin_text = "a๎๎๎๎๎๎๎๎๎๎๎๎๎๎๎๎๎๎๎b是特殊Mn字符123.4! 56~ jerry! 数学符号🤌中国USTC"
text = origin_text.lower()
print(f'Origin: {text} : {len(text)}')
# The text contains special characters such as Mn combining marks, so first apply
# NFD normalization (canonical decomposition)
text = unicodedata.normalize('NFD', text)
print(f'Normalize: {text} : {len(text)}')
# Drop Mn (nonspacing mark) characters
text = ''.join([ch for ch in text if unicodedata.category(ch) != 'Mn'])
print(f'Text: {text} : {len(text)}')
# Space-separate the text
spaced = ''
for ch in text:
    # Punctuation and CJK characters: surround with spaces
    if is_punctuation(ch) or is_cjk_character(ch):
        spaced += ' ' + ch + ' '
    # Whitespace: collapse to a single space
    elif is_space(ch):
        spaced += ' '
    # Drop NUL (0), 0xFFFD and control characters; none of them are printable
    elif ord(ch) == 0 or ord(ch) == 0xfffd or is_control(ch):
        continue
    # Digits, Latin letters, math symbols etc.: append as-is
    else:
        spaced += ch
print(f'Spaced text: {spaced}')
Origin: a๎๎๎๎๎๎๎๎๎๎๎๎๎๎๎๎๎๎๎b是特殊mn字符123.4! 56~ jerry! 数学符号🤌中国ustc : 57
Normalize: a๎๎๎๎๎๎๎๎๎๎๎๎๎๎๎๎๎๎๎b是特殊mn字符123.4! 56~ jerry! 数学符号🤌中国ustc : 57
Text: ab是特殊mn字符123.4! 56~ jerry! 数学符号🤌中国ustc : 38
Spaced text: ab 是 特 殊 mn 字 符 123 . 4 ! 56 ~ jerry ! 数 学 符 号 🤌 中 国 ustc
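# A minimal sketch wrapping the steps above into a single pre-tokenization function
# (the name basic_tokenize is introduced here for convenience; it is not part of the
# cells above)
def basic_tokenize(text):
    """Lowercase, NFD-normalize, drop Mn marks / control characters, then
    space-separate punctuation and CJK characters."""
    text = unicodedata.normalize('NFD', text.lower())
    text = ''.join(ch for ch in text if unicodedata.category(ch) != 'Mn')
    spaced = ''
    for ch in text:
        if is_punctuation(ch) or is_cjk_character(ch):
            spaced += ' ' + ch + ' '
        elif is_space(ch):
            spaced += ' '
        elif ord(ch) == 0 or ord(ch) == 0xfffd or is_control(ch):
            continue
        else:
            spaced += ch
    return spaced.strip().split()
print(basic_tokenize(origin_text))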
# Note: the blob/ URL returns an HTML page; use resolve/ to fetch the raw file:
# https://huggingface.co/hfl/chinese-roberta-wwm-ext/resolve/main/vocab.txt
!wget https://huggingface.co/hfl/chinese-roberta-wwm-ext/blob/main/vocab.txt
--2021-08-23 02:00:28-- https://huggingface.co/hfl/chinese-roberta-wwm-ext/blob/main/vocab.txt
Resolving huggingface.co (huggingface.co)... 34.200.164.230, 54.84.221.171, 34.195.144.223, ...
Connecting to huggingface.co (huggingface.co)|34.200.164.230|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4564022 (4.4M) [text/html]
Saving to: ‘vocab.txt’
vocab.txt 100%[===================>] 4.35M 2.58MB/s in 1.7s
2021-08-23 02:00:32 (2.58 MB/s) - ‘vocab.txt’ saved [4564022/4564022]
def load_vocab(vocab_path):
    """Load a BERT-style vocab.txt into a {token: id} dict (id = line number)."""
    tokens_dict = {}
    with open(vocab_path, encoding='utf-8') as fp:
        for line in fp:
            token = line.split()
            token = token[0] if token else line.strip()
            tokens_dict[token] = len(tokens_dict)
    return tokens_dict
tokens_dict = load_vocab('../../../cache/vocab.txt')
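# Sanity check: a BERT-style vocab.txt normally contains these special tokens
# (assuming this vocab follows the standard BERT convention)
for tok in ('[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]'):
    print(tok, tokens_dict.get(tok))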
def word_piece_tokenize(word):
    """Greedy longest-match-first split of a word into WordPiece subwords.
    Words longer than 200 characters are left unsplit (_word_maxlen=200).
    """
    if len(word) > 200:
        return [word]
    tokens, start, end = [], 0, 0
    while start < len(word):
        end = len(word)
        # Try the longest substring first, shrinking from the right until a vocab hit
        while end > start:
            sub = word[start:end]
            if start > 0:
                sub = '##' + sub  # non-initial subwords carry the '##' prefix
            if sub in tokens_dict:
                break
            end -= 1
        # No prefix of the remainder is in the vocab: return the whole word unsplit
        if start == end:
            return [word]
        else:
            tokens.append(sub)
            start = end
    return tokens
tokens = []
for word in spaced.strip().split():
    print(word)
    tokens.extend(word_piece_tokenize(word))
ab
是
特
殊
mn
字
符
123
.
4
!
56
~
jerry
!
数
学
符
号
🤌
中
国
ustc
len(tokens)
25
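# Mapping tokens to vocabulary ids; out-of-vocabulary tokens (e.g. the 🤌 emoji)
# fall back to [UNK] (assuming '[UNK]' is present in this vocab, as in standard
# BERT vocabs)
unk_id = tokens_dict.get('[UNK]')
token_ids = [tokens_dict.get(t, unk_id) for t in tokens]
print(token_ids[:10])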
The Rematch algorithm
Maps answer_start_id, a character offset in the original context text, to the corresponding index in the tokens list produced by the tokenizer.
# Expected mapping format: each token maps to the list of original character indices
# it covers (an empty list for tokens with no counterpart in the original text)
mapping = [[], [0], [1], [2]]
def rematch(text, tokens):
    """Return, for each token, the list of character indices it covers in the
    original text.
    """
    text = text.lower()
    # Apply the same normalization as the tokenizer, remembering which original
    # character each normalized character came from
    normalized_text, char_mapping = '', []
    for i, ch in enumerate(text):
        ch = unicodedata.normalize('NFD', ch)
        ch = ''.join([c for c in ch if unicodedata.category(c) != 'Mn'])
        ch = ''.join([
            c for c in ch
            if not (ord(c) == 0 or ord(c) == 0xfffd or is_control(c))
        ])
        normalized_text += ch
        char_mapping.extend([i] * len(ch))
    text, token_mapping, offset = normalized_text, [], 0
    for token in tokens:
        # Strip the WordPiece '##' prefix before searching in the normalized text
        if token.startswith('##'):
            token = token[2:]
        start = text[offset:].index(token) + offset
        end = start + len(token)
        token_mapping.append(char_mapping[start:end])
        offset = end
    return token_mapping
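# A usage sketch with made-up offsets: find the token indices covering a
# character-level answer span in origin_text (answer_start/answer_end are hypothetical)
token_mapping = rematch(origin_text, tokens)
answer_start, answer_end = 21, 26   # hypothetical character span in origin_text
token_span = [i for i, m in enumerate(token_mapping)
              if m and m[0] <= answer_end and m[-1] >= answer_start]
print(token_span)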