2022年最值得收藏的 25 个 Python 文本处理案例!
- 1提取PDF内容
- 2提取Word内容
- 3提取Web网页内容
- 4读取json数据
- 5读取CSV数据
- 6删除字符串中的标点符号
- 7使用NLTK删除停用词
- 8使用TextBlob更正拼写
- 9使用NLTK和TextBlob的词标记化
- 10使用NLTK提取句子单词或短语的词干列表
- 11使用NLTK进行句子或短语词形还原
- 12使用NLTK从文本文件中查找每个单词的频率
- 13从语料库中创建词云
- 14NLTK词法散布图
- 15使用CountVectorizer将文本转换为数字
- 16使用TF-IDF创建文档术语矩阵
- 17为给定句子生成N-gram
- 18使用带有二元组的sklearn CountVectorizer词汇规范
- 19使用TextBlob提取名词短语
- 20如何计算词-词共现矩阵
- 21使用TextBlob进行情感分析
- 22使用Goslate进行语言翻译
- 23使用TextBlob进行语言检测和翻译
- 24使用TextBlob获取定义和同义词
- 25使用TextBlob获取反义词列表
1提取 PDF 内容
- # pip install PyPDF2 安装 PyPDF2
- import PyPDF2
- from PyPDF2 import PdfFileReader
- # Creating a pdf file object.
- pdf = open("test.pdf", "rb")
- # Creating pdf reader object.
- pdf_reader = PyPDF2.PdfFileReader(pdf)
- # Checking total number of pages in a pdf file.
- print("Total number of Pages:", pdf_reader.numPages)
- # Creating a page object.
- page = pdf_reader.getPage(200)
- # Extract data from a specific page number.
- print(page.extractText())
- # Closing the object.
- pdf.close()
2提取 Word 内容
- # pip install python-docx 安装 python-docx
- import docx
- def main():
- try:
- doc = docx.Document('test.docx') # Creating word reader object.
- data = ""
- fullText = []
- for para in doc.paragraphs:
- fullText.append(para.text)
- data = '\n'.join(fullText)
- print(data)
- except IOError:
- print('There was an error opening the file!')
- return
- if __name__ == '__main__':
- main()
3提取 Web 网页内容
- # pip install bs4 安装 bs4
- from urllib.request import Request, urlopen
- from bs4 import BeautifulSoup
- req = Request('http://www.cmegroup.com/trading/products/#sortField=oi&sortAsc=false&venues=3&page=1&cleared=1&group=1',
- headers={'User-Agent': 'Mozilla/5.0'})
- webpage = urlopen(req).read()
- # Parsing
- soup = BeautifulSoup(webpage, 'html.parser')
- # Formatting the parsed HTML file
- strhtm = soup.prettify()
- # Print first 500 characters
- print(strhtm[:500])
- # Extract meta tag value
- print(soup.title.string)
- print(soup.find('meta', attrs={'property':'og:description'}))
- # Extract anchor tag value
- for x in soup.find_all('a'):
- print(x.string)
- # Extract Paragraph tag value
- for x in soup.find_all('p'):
- print(x.text)
4读取 Json 数据
- import requests
- import json
- r = requests.get("https://support.oneskyapp.com/hc/en-us/article_attachments/202761727/example_2.json")
- res = r.json()
- # Extract specific node content.
- print(res['quiz']['sport'])
- # Dump data as string
- data = json.dumps(res)
- print(data)
5读取 CSV 数据
- import csv
- with open('test.csv','r') as csv_file:
- reader =csv.reader(csv_file)
- next(reader) # Skip first row
- for row in reader:
- print(row)
6删除字符串中的标点符号
- import re
- import string
- data = "Stuning even for the non-gamer: This sound track was beautiful!\
- It paints the senery in your mind so well I would recomend\
- it even to people who hate vid. game music! I have played the game Chrono \
- Cross but out of all of the games I have ever played it has the best music! \
- It backs away from crude keyboarding and takes a fresher step with grate\
- guitars and soulful orchestras.\
- It would impress anyone who cares to listen!"
- # Method 1: Regex
- # Remove the special characters from the read string.
- no_specials_string = re.sub('[!#?,.:";]', '', data)
- print(no_specials_string)
- # Method 2: translate()
- # Make translator object
- translator = str.maketrans('', '', string.punctuation)
- data = data.translate(translator)
- print(data)
7使用 NLTK 删除停用词
- from nltk.corpus import stopwords
- data = ['Stuning even for the non-gamer: This sound track was beautiful!\
- It paints the senery in your mind so well I would recomend\
- it even to people who hate vid. game music! I have played the game Chrono \
- Cross but out of all of the games I have ever played it has the best music! \
- It backs away from crude keyboarding and takes a fresher step with grate\
- guitars and soulful orchestras.\
- It would impress anyone who cares to listen!']
- # Remove stop words
- stopwords = set(stopwords.words('english'))
- output = []
- for sentence in data:
- temp_list = []
- for word in sentence.split():
- if word.lower() not in stopwords:
- temp_list.append(word)
- output.append(' '.join(temp_list))
- print(output)
8使用 TextBlob 更正拼写
- from textblob import TextBlob
- data = "Natural language is a cantral part of our day to day life, and it's so antresting to work on any problem related to langages."
- output = TextBlob(data).correct()
- print(output)
9使用 NLTK 和 TextBlob 的词标记化
- import nltk
- from textblob import TextBlob
- data = "Natural language is a central part of our day to day life, and it's so interesting to work on any problem related to languages."
- nltk_output = nltk.word_tokenize(data)
- textblob_output = TextBlob(data).words
- print(nltk_output)
- print(textblob_output)
['Natural', 'language', 'is', 'a', 'central', 'part', 'of', 'our', 'day', 'to', 'day', 'life', ',', 'and', 'it', "'s", 'so', 'interesting', 'to', 'work', 'on', 'any', 'problem', 'related', 'to', 'languages', '.']
['Natural', 'language', 'is', 'a', 'central', 'part', 'of', 'our', 'day', 'to', 'day', 'life', 'and', 'it', "'s", 'so', 'interesting', 'to', 'work', 'on', 'any', 'problem', 'related', 'to', 'languages']
10使用 NLTK 提取句子单词或短语的词干列表
- from nltk.stem import PorterStemmer
- st = PorterStemmer()
- text = ['Where did he learn to dance like that?',
- 'His eyes were dancing with humor.',
- 'She shook her head and danced away',
- 'Alex was an excellent dancer.']
- output = []
- for sentence in text:
- output.append(" ".join([st.stem(i) for i in sentence.split()]))
- for item in output:
- print(item)
- print("-" * 50)
- print(st.stem('jumping'), st.stem('jumps'), st.stem('jumped'))
where did he learn to danc like that?
hi eye were danc with humor.
she shook her head and danc away
alex wa an excel dancer.
jump jump jump
11使用 NLTK 进行句子或短语词形还原
- from nltk.stem import WordNetLemmatizer
- wnl = WordNetLemmatizer()
- text = ['She gripped the armrest as he passed two cars at a time.',
- 'Her car was in full view.',
- 'A number of cars carried out of state license plates.']
- output = []
- for sentence in text:
- output.append(" ".join([wnl.lemmatize(i) for i in sentence.split()]))
- for item in output:
- print(item)
- print("*" * 10)
- print(wnl.lemmatize('jumps', 'n'))
- print(wnl.lemmatize('jumping', 'v'))
- print(wnl.lemmatize('jumped', 'v'))
- print("*" * 10)
- print(wnl.lemmatize('saddest', 'a'))
- print(wnl.lemmatize('happiest', 'a'))
- print(wnl.lemmatize('easiest', 'a'))
She gripped the armrest a he passed two car at a time.
Her car wa in full view.
A number of car carried out of state license plates.
12使用 NLTK 从文本文件中查找每个单词的频率
- import nltk
- from nltk.corpus import webtext
- from nltk.probability import FreqDist
- nltk.download('webtext')
- wt_words = webtext.words('testing.txt')
- data_analysis = nltk.FreqDist(wt_words)
- # Let's keep only the words whose length is greater than 3 characters.
- filter_words = dict([(m, n) for m, n in data_analysis.items() if len(m) > 3])
- for key in sorted(filter_words):
- print("%s: %s" % (key, filter_words[key]))
- data_analysis = nltk.FreqDist(filter_words)
- data_analysis.plot(25, cumulative=False)
[nltk_data] Downloading package webtext to
[nltk_data] C:\Users\amit\AppData\Roaming\nltk_data...
[nltk_data] Unzipping corpora\webtext.zip.
1989: 1
Accessing: 1
Analysis: 1
Anyone: 1
Chapter: 1
Coding: 1
Data: 1
13从语料库中创建词云
- import nltk
- from nltk.corpus import webtext
- from nltk.probability import FreqDist
- from wordcloud import WordCloud
- import matplotlib.pyplot as plt
- nltk.download('webtext')
- wt_words = webtext.words('testing.txt') # Sample data
- data_analysis = nltk.FreqDist(wt_words)
- filter_words = dict([(m, n) for m, n in data_analysis.items() if len(m) > 3])
- wcloud = WordCloud().generate_from_frequencies(filter_words)
- # Plotting the wordcloud
- plt.imshow(wcloud, interpolation="bilinear")
- plt.axis("off")
- # Notebook output of plt.axis("off"): (-0.5, 399.5, 199.5, -0.5)
- plt.show()