# (Video tutorial recorded personally by the blog author)
# QQ: 231469242 - original work, all rights reserved
# -*- coding: utf-8 -*-
"""
Created on Tue Jan 10 16:28:08 2017

@author: Administrator

Script dedicated to filtering junk words out of a list of tokens.
"""
import re
import string

import nltk
from nltk.corpus import stopwords

# Prepare your own test document, e.g.:
# fileName="article.txt"

# English stop words (153 entries according to the original author; the
# exact count depends on the installed NLTK corpus).
list_stopWords = set(stopwords.words('english'))

# Sample token list used for manual testing.
list_test = ['a', '!', 'b', '--', 'c', '10', '.', 'a', 'b']

# All ASCII letters:
# abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ
list_letters = string.ascii_letters

# All decimal digits: 0123456789
list_digits = string.digits

# Letters followed by digits (a single string, despite the "list_" prefix).
list_digits_letters = list_letters + list_digits

# All ASCII punctuation: !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
list_punctuation = string.punctuation

# All ASCII whitespace: ' \t\n\r\x0b\x0c'
list_whiteSpace = string.whitespace

# Matches a string made up only of decimal digits (the empty string included).
# Compiled once here instead of on every call.
_DIGIT_STRING_RE = re.compile(r"\d*")


# Case conversion.
def Text_lower(text):
    """Return ``text`` converted to lower case."""
    return text.lower()


# Strip punctuation and whitespace tokens from a word list.
def word_punctuation_process(list_words):
    """Remove punctuation/whitespace tokens from ``list_words`` in place.

    A token is dropped when it is a substring of
    ``string.whitespace + string.punctuation``. That covers every single
    punctuation or whitespace character, the empty string, and runs that
    happen to be contiguous in that string (e.g. ``"()"``); a token such as
    ``"--"`` is not a substring and is kept.

    The list is rewritten through slice assignment rather than by calling
    ``remove()`` while iterating, which would skip the element following
    each removal.

    Returns the same (mutated) list object.
    """
    badChars = string.whitespace + string.punctuation
    list_words[:] = [word for word in list_words if word not in badChars]
    return list_words


# Drop tokens that contain no letter or digit at all.
def Remove_badMarks(list_words):
    """Remove tokens flagged by ``Detect_badCharecter`` from ``list_words``.

    The list is filtered in place (slice assignment) so that consecutive
    bad tokens are all removed, and the same list object is returned.
    """
    list_words[:] = [
        word for word in list_words if not Detect_badCharecter(word)
    ]
    return list_words


# Remove stop words.
def stopWords_process(list_words):
    """Return a new list holding the words not found in ``list_stopWords``."""
    list_filtered_words = [w for w in list_words if w not in list_stopWords]
    return list_filtered_words


# Detect "weird" tokens: a token containing any letter or digit is fine.
def Detect_badCharecter(word):
    """Return True when ``word`` contains no ASCII letter or digit.

    Every character is inspected, so a token such as ``"-a"`` is not
    reported as bad. An empty token has no letter or digit and is reported
    as bad.
    """
    return not any(ch in list_digits_letters for ch in word)


# Whether the word is long enough to keep (at least 3 characters).
def Word_length(word):
    """Return True when ``word`` has three or more characters."""
    return len(word) >= 3


# --- Helpers for handling numbers ---

# Digit strings such as "1989", "3", "58".
def If_re0_d(variable):
    """Return True when ``variable`` is a string made up only of digits.

    Non-string input yields False. The pattern is ``\\d*``, so the empty
    string is also accepted.
    """
    if not isinstance(variable, str):
        return False
    return _DIGIT_STRING_RE.fullmatch(variable) is not None


# Integer values such as 1989, 2, 33.
def If_re1_d(variable):
    """Return True when ``variable`` is an ``int`` (``bool`` is excluded)."""
    return isinstance(variable, int) and not isinstance(variable, bool)


# Originally written to clear specially formatted numbers from agent
# e-mail lists.
def If_digit(variable):
    """Return True when ``variable`` is a digit string or an integer."""
    return If_re0_d(variable) or If_re1_d(variable)


def Detect_num(word):
    """Return True when any element of ``word`` is a digit.

    Every character is inspected, so tokens that mix letters and digits
    (for example chemical formulas) are reported as numeric as well.
    """
    return any(If_digit(ch) for ch in word)
# Final processing function: convert one TXT document into a list of words.
# Input parameter: the file name.
def Get_listOfWords(fileName):
    """Read ``fileName`` and return its filtered list of words.

    Processing steps, in order:
      1. read the whole file and lower-case the text;
      2. tokenise it into words with ``nltk.word_tokenize``;
      3. drop punctuation / whitespace tokens;
      4. drop tokens made of odd symbols such as "--";
      5. keep only tokens accepted by ``Word_length``;
      6. drop English stop words;
      7. drop tokens reported as numeric by ``Detect_num``.

    Args:
        fileName: path of the text file to process. It is opened in text
            mode with the platform default encoding.

    Returns:
        The list of remaining word tokens.
    """
    # Open the file that was actually requested and make sure the handle is
    # closed again, even if reading fails.
    with open(fileName, 'r') as file:
        str_text = file.read()
    # Convert the article text to lower case.
    str_lowerText = Text_lower(str_text)
    # Word tokenisation: the whole article becomes one list of tokens.
    list_word_article = nltk.word_tokenize(str_lowerText)
    # Remove punctuation.
    list_removepunctuation = word_punctuation_process(list_word_article)
    # Remove odd symbols, e.g. "--".
    list_removeBadMarks = Remove_badMarks(list_removepunctuation)
    # Filter out words that are too short.
    list_length_Less3 = [w for w in list_removeBadMarks if Word_length(w)]
    # Remove stop words.
    list_remove_stopWords = stopWords_process(list_length_Less3)
    # Remove numbers. Be careful with texts containing chemical formulas:
    # this step may filter them out as well.
    list_remove_nums = [w for w in list_remove_stopWords if not Detect_num(w)]
    return list_remove_nums


# Example usage:
# list_words=Get_listOfWords(fileName)