角木蛟要学习

import re, collections

# Example text:
# "The Project Gutenberg EBook of The Adventures of Sherlock Holmes\nby Sir Arthur Conan Doyle"
def tokens(text):
    """
    Split text into lower-case words.

    The text is lower-cased first, then every maximal run of the letters
    a-z is returned, in order of appearance. Digits, punctuation, whitespace
    and any other characters act as separators.

    text: the string to tokenize.
    Returns a list of lower-case words (empty if text contains no letters).
    """
    return re.findall(r"[a-z]+", text.lower())

# Load the corpus file and count how often each word occurs in it.
# The encoding is given explicitly so the counts do not depend on the
# platform's default encoding. tokens() keeps only runs of a-z, so a byte
# that cannot be decoded is replaced (and then acts as a separator) instead
# of aborting the whole load.
with open('big.txt', 'r', encoding='utf-8', errors='replace') as f:
    words = tokens(f.read())
# Maps each corpus word to its number of occurrences.
word_counts = collections.Counter(words)

# Example text: 'fianlly helloa'
def correct_text_generic(text):
    """
    Return text with every word replaced by its corrected spelling.

    Each maximal run of ASCII letters is passed to correct_match(); all
    other characters (spaces, digits, punctuation) are left untouched.

    text: the string to correct.
    Returns the corrected string.
    """
    return re.sub(r'[a-zA-Z]+', correct_match, text)

def correct_match(match):
    """
    Substitution callback: return the corrected spelling of a matched word.

    The matched word is lower-cased and passed to correct(); the result is
    then re-cased to follow the original word. All-uppercase, all-lowercase
    and title-case words keep that form; for any other mix of cases the
    corrected word is returned exactly as correct() produced it.

    match: the regular-expression match object for one word.
    Returns the replacement string.
    """
    word = match.group()

    def case_of(text):
        """
        Return the function that reproduces the casing of text:
        str.upper, str.lower or str.title, or str (which leaves a string
        unchanged) when text fits none of those forms.
        """
        if text.isupper():
            return str.upper
        if text.islower():
            return str.lower
        if text.istitle():
            return str.title
        return str

    return case_of(word)(correct(word.lower()))

def correct(word):
    """
    Return the best spelling correction for word.

    Candidates come from the first non-empty group, tried in this order:
      1. the word itself, if it occurs in the corpus;
      2. corpus words reachable by one edit;
      3. corpus words reachable by two edits;
      4. the word itself, unchanged, when nothing in the corpus is close.
    Within the chosen group the word with the highest corpus count wins.

    word: the (lower-case) word to correct.
    Returns the chosen spelling.
    """
    # `or` short-circuits, so the large two-edit set is only built when the
    # cheaper groups found nothing.
    candidates = (known(edits0(word)) or
                  known(edits1(word)) or
                  known(edits2(word)) or
                  {word})
    # Candidates form a set, whose iteration order for strings can differ
    # between runs; break ties between equally frequent words alphabetically
    # so the same input always gives the same correction.
    return min(candidates, key=lambda w: (-word_counts.get(w, 0), w))

def known(words):
    """
    Return the subset of words that occur in the corpus.

    words: an iterable of candidate words.
    Returns a set holding every candidate that is a key of word_counts
    (empty if none is).
    """
    return {w for w in words if w in word_counts}

def edits0(word):
    """
    Return the set of words at edit distance 0 from word, i.e. a set
    containing only the word itself.
    """
    return {word}

def edits1(word):
    """
    Return the set of strings produced by applying one edit to word.

    An edit is one of: deleting a character, swapping two adjacent
    characters, replacing a character with a letter a-z, or inserting a
    letter a-z at any position. Replacing a character with itself is
    included, so a non-empty word is a member of its own result.

    word: the string to edit.
    Returns a set of strings.
    """
    # The 26 lower-case English letters used for replacements and insertions.
    alphabet = 'abcdefghijklmnopqrstuvwxyz'

    def splits(word):
        """
        Return every way of cutting word into a (head, tail) pair.
        For "cat": ("", "cat"), ("c", "at"), ("ca", "t"), ("cat", "").
        """
        return [(word[:i], word[i:])
                for i in range(len(word) + 1)]

    pairs = splits(word)

    # Drop the first character of the tail.
    deletes = [a + b[1:] for (a, b) in pairs if b]
    # Swap the first two characters of the tail.
    transposes = [a + b[1] + b[0] + b[2:] for (a, b) in pairs if len(b) > 1]
    # Replace the first character of the tail; the empty tail is filtered
    # out before the letter loop rather than once per letter.
    replaces = [a + c + b[1:] for (a, b) in pairs if b for c in alphabet]
    # Insert a letter between head and tail.
    inserts = [a + c + b for (a, b) in pairs for c in alphabet]

    return set(deletes + transposes + replaces + inserts)

def edits2(word):
    """
    Return the set of strings produced by applying two edits to word.

    Built by applying edits1() to every result of edits1(word), so it
    contains every string reachable in two single-character edits.

    word: the string to edit.
    Returns a set of strings.
    """
    return {e2 for e1 in edits1(word) for e2 in edits1(e1)}

【Python】英语单词拼写检查与纠错的相关函数

评论列表，共 0 条评论

© 2023 - good good study day day up - 角木蛟要学习

因为喜欢，可迎万难