25 Python Text Processing Examples Worth Bookmarking in 2022!
Table of Contents
- 1. Extract PDF content
- 2. Extract Word content
- 3. Extract web page content
- 4. Read JSON data
- 5. Read CSV data
- 6. Remove punctuation from a string
- 7. Remove stop words with NLTK
- 8. Correct spelling with TextBlob
- 9. Word tokenization with NLTK and TextBlob
- 10. Stem the words of a sentence or phrase with NLTK
- 11. Lemmatize a sentence or phrase with NLTK
- 12. Find the frequency of each word in a text file with NLTK
- 13. Create a word cloud from a corpus
- 14. NLTK lexical dispersion plot
- 15. Convert text to numbers with CountVectorizer
- 16. Create a document-term matrix with TF-IDF
- 17. Generate N-grams for a given sentence
- 18. Vocabulary specification with sklearn CountVectorizer and bigrams
- 19. Extract noun phrases with TextBlob
- 20. How to compute a word-word co-occurrence matrix
- 21. Sentiment analysis with TextBlob
- 22. Language translation with Goslate
- 23. Language detection and translation with TextBlob
- 24. Get definitions and synonyms with TextBlob
- 25. Get a list of antonyms with TextBlob
1. Extract PDF Content
```python
# pip install PyPDF2
import PyPDF2

# Creating a pdf file object.
pdf = open("test.pdf", "rb")

# Creating a pdf reader object.
pdf_reader = PyPDF2.PdfFileReader(pdf)

# Checking the total number of pages in the pdf file.
print("Total number of Pages:", pdf_reader.numPages)

# Creating a page object.
page = pdf_reader.getPage(200)

# Extracting text from a specific page.
print(page.extractText())

# Closing the file object.
pdf.close()
```
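The snippet above uses the legacy PyPDF2 1.x API (`PdfFileReader`, `numPages`, `getPage`, `extractText`), which later releases removed. A minimal equivalent sketch against the modern `pypdf` package, assuming the same `test.pdf`:

```python
# pip install pypdf
from pypdf import PdfReader

reader = PdfReader("test.pdf")

# Total number of pages.
print("Total number of Pages:", len(reader.pages))

# Extract text from a specific page (0-indexed).
page = reader.pages[200]
print(page.extract_text())
```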
2. Extract Word Content
```python
# pip install python-docx
import docx


def main():
    try:
        # Creating a word reader object.
        doc = docx.Document('test.docx')
        fullText = []
        for para in doc.paragraphs:
            fullText.append(para.text)
        data = '\n'.join(fullText)
        print(data)
    except IOError:
        print('There was an error opening the file!')
        return


if __name__ == '__main__':
    main()
```
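`doc.paragraphs` only covers body text; anything inside Word tables lives under `doc.tables`. A minimal sketch for pulling that text out as well, assuming the same `test.docx`:

```python
import docx

doc = docx.Document('test.docx')

# Walk every table, row, and cell in the document.
for table in doc.tables:
    for row in table.rows:
        print('\t'.join(cell.text for cell in row.cells))
```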
3. Extract Web Page Content
```python
# pip install bs4
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup

req = Request('http://www.cmegroup.com/trading/products/#sortField=oi&sortAsc=false&venues=3&page=1&cleared=1&group=1',
              headers={'User-Agent': 'Mozilla/5.0'})
webpage = urlopen(req).read()

# Parsing
soup = BeautifulSoup(webpage, 'html.parser')

# Formatting the parsed html
strhtm = soup.prettify()

# Print the first 500 characters
print(strhtm[:500])

# Extract the title and a meta tag value
print(soup.title.string)
print(soup.find('meta', attrs={'property': 'og:description'}))

# Extract anchor tag values
for x in soup.find_all('a'):
    print(x.string)

# Extract paragraph tag values
for x in soup.find_all('p'):
    print(x.text)
```
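One gotcha in the anchor loop: `x.string` returns `None` whenever an `<a>` tag wraps other tags instead of bare text, so `x.get_text()` is the safer accessor if you always want the visible link text.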
4. Read JSON Data
```python
import requests
import json

r = requests.get("https://support.oneskyapp.com/hc/en-us/article_attachments/202761727/example_2.json")
res = r.json()

# Extract a specific node's content.
print(res['quiz']['sport'])

# Dump the data back out as a JSON string.
data = json.dumps(res)
print(data)
```
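For a JSON file on disk rather than one fetched over HTTP, `json.load` does the same job; a minimal sketch, assuming a local copy named `example_2.json`:

```python
import json

# Parse a local JSON file instead of an HTTP response.
with open('example_2.json', 'r', encoding='utf-8') as f:
    res = json.load(f)

print(res['quiz']['sport'])
```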
5. Read CSV Data
```python
import csv

with open('test.csv', 'r') as csv_file:
    reader = csv.reader(csv_file)
    next(reader)  # Skip the header row
    for row in reader:
        print(row)
```
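When the CSV has a header row, `csv.DictReader` uses it as keys automatically, so no manual skip is needed; a minimal sketch against the same hypothetical `test.csv`:

```python
import csv

with open('test.csv', 'r') as csv_file:
    for row in csv.DictReader(csv_file):
        print(row)  # each row is a dict keyed by column name
```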
6. Remove Punctuation from a String
```python
import re
import string

data = "Stuning even for the non-gamer: This sound track was beautiful! \
It paints the senery in your mind so well I would recomend \
it even to people who hate vid. game music! I have played the game Chrono \
Cross but out of all of the games I have ever played it has the best music! \
It backs away from crude keyboarding and takes a fresher step with grate \
guitars and soulful orchestras. \
It would impress anyone who cares to listen!"

# Method 1: Regex
# Remove the listed special characters from the string.
no_specials_string = re.sub('[!#?,.:";]', '', data)
print(no_specials_string)

# Method 2: translate()
# Make a translator object that deletes all punctuation.
translator = str.maketrans('', '', string.punctuation)
data = data.translate(translator)
print(data)
```
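Note that the two methods are not equivalent: the regex strips only the characters listed in its class, while `string.punctuation` covers all ASCII punctuation (including, for example, the hyphen in "non-gamer"). A class like `r'[^\w\s]'` would bring the regex version roughly in line.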
7. Remove Stop Words with NLTK
```python
from nltk.corpus import stopwords

# nltk.download('stopwords') may be needed on first use.
data = ['Stuning even for the non-gamer: This sound track was beautiful! \
It paints the senery in your mind so well I would recomend \
it even to people who hate vid. game music! I have played the game Chrono \
Cross but out of all of the games I have ever played it has the best music! \
It backs away from crude keyboarding and takes a fresher step with grate \
guitars and soulful orchestras. \
It would impress anyone who cares to listen!']

# Remove stop words
stop_words = set(stopwords.words('english'))
output = []
for sentence in data:
    temp_list = []
    for word in sentence.split():
        if word.lower() not in stop_words:
            temp_list.append(word)
    output.append(' '.join(temp_list))

print(output)
```
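Because `split()` leaves punctuation glued to words, a stop word followed by a comma or period in running text will not match the lookup. Tokenizing first is more robust; a minimal sketch using `nltk.word_tokenize`:

```python
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# nltk.download('punkt') and nltk.download('stopwords') on first use.
stop_words = set(stopwords.words('english'))

sentence = "It would impress anyone who cares to listen!"
tokens = word_tokenize(sentence)

# Keep alphabetic tokens that are not stop words.
filtered = [w for w in tokens if w.isalpha() and w.lower() not in stop_words]
print(' '.join(filtered))
```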
8. Correct Spelling with TextBlob
```python
from textblob import TextBlob

data = "Natural language is a cantral part of our day to day life, and it's so antresting to work on any problem related to langages."

output = TextBlob(data).correct()
print(output)
```
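TextBlob's `correct()` is based on Peter Norvig's statistical spell-corrector, and the TextBlob documentation itself puts its accuracy at roughly 70%, so treat the output as a suggestion rather than ground truth.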
9. Word Tokenization with NLTK and TextBlob
```python
import nltk
from textblob import TextBlob

data = "Natural language is a central part of our day to day life, and it's so interesting to work on any problem related to languages."

nltk_output = nltk.word_tokenize(data)
textblob_output = TextBlob(data).words

print(nltk_output)
print(textblob_output)
```
Output:

```
['Natural', 'language', 'is', 'a', 'central', 'part', 'of', 'our', 'day', 'to', 'day', 'life', ',', 'and', 'it', "'s", 'so', 'interesting', 'to', 'work', 'on', 'any', 'problem', 'related', 'to', 'languages', '.']
['Natural', 'language', 'is', 'a', 'central', 'part', 'of', 'our', 'day', 'to', 'day', 'life', 'and', 'it', "'s", 'so', 'interesting', 'to', 'work', 'on', 'any', 'problem', 'related', 'to', 'languages']
```
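Comparing the two lists: NLTK keeps the punctuation tokens (the comma and the final period) while TextBlob's word list drops them; both split the contraction "it's" into 'it' and "'s".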
10. Stem the Words of a Sentence or Phrase with NLTK
```python
from nltk.stem import PorterStemmer

st = PorterStemmer()
text = ['Where did he learn to dance like that?',
        'His eyes were dancing with humor.',
        'She shook her head and danced away',
        'Alex was an excellent dancer.']
output = []
for sentence in text:
    output.append(" ".join([st.stem(i) for i in sentence.split()]))

for item in output:
    print(item)

print("-" * 50)
print(st.stem('jumping'), st.stem('jumps'), st.stem('jumped'))
```
Output:

```
where did he learn to danc like that?
hi eye were danc with humor.
she shook her head and danc away
alex wa an excel dancer.
--------------------------------------------------
jump jump jump
```
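As the output shows, the Porter stemmer lowercases everything and happily emits non-words ('danc', 'hi', 'wa'): stems are crude equivalence classes for matching, not dictionary forms. Lemmatization, next, trades some of that aggressiveness for real words.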
11. Lemmatize a Sentence or Phrase with NLTK
```python
from nltk.stem import WordNetLemmatizer

# nltk.download('wordnet') may be needed on first use.
wnl = WordNetLemmatizer()
text = ['She gripped the armrest as he passed two cars at a time.',
        'Her car was in full view.',
        'A number of cars carried out of state license plates.']
output = []
for sentence in text:
    output.append(" ".join([wnl.lemmatize(i) for i in sentence.split()]))

for item in output:
    print(item)

print("*" * 10)
print(wnl.lemmatize('jumps', 'n'))
print(wnl.lemmatize('jumping', 'v'))
print(wnl.lemmatize('jumped', 'v'))
print("*" * 10)
print(wnl.lemmatize('saddest', 'a'))
print(wnl.lemmatize('happiest', 'a'))
print(wnl.lemmatize('easiest', 'a'))
```
Output:

```
She gripped the armrest a he passed two car at a time.
Her car wa in full view.
A number of car carried out of state license plates.
**********
jump
jump
jump
**********
sad
happy
easy
```
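`lemmatize()` defaults to treating every token as a noun (pos='n'), which is why only noun plurals change in the sentences above ('cars' → 'car', with 'as' and 'was' mangled as if they were plurals) while verbs like 'gripped' and 'passed' pass through untouched. Supplying the part of speech, as in the explicit 'v' and 'a' calls, is what unlocks verb and adjective lemmas.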
12. Find the Frequency of Each Word in a Text File with NLTK
```python
import nltk
from nltk.corpus import webtext
from nltk.probability import FreqDist

nltk.download('webtext')
wt_words = webtext.words('testing.txt')
data_analysis = nltk.FreqDist(wt_words)

# Keep only the words that are longer than 3 characters.
filter_words = dict([(m, n) for m, n in data_analysis.items() if len(m) > 3])

for key in sorted(filter_words):
    print("%s: %s" % (key, filter_words[key]))

data_analysis = nltk.FreqDist(filter_words)
data_analysis.plot(25, cumulative=False)
```
Output:

```
[nltk_data] Downloading package webtext to
[nltk_data]     C:\Users\amit\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\webtext.zip.
1989: 1
Accessing: 1
Analysis: 1
Anyone: 1
Chapter: 1
Coding: 1
Data: 1
...
```
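One caveat: 'testing.txt' is not among the fileids that ship with the webtext corpus (those are files such as 'firefox.txt' and 'grail.txt'; see `webtext.fileids()`). The snippet assumes you have placed your own testing.txt in the corpus directory, so substitute a built-in fileid to run it as-is.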
13. Create a Word Cloud from a Corpus
```python
import nltk
from nltk.corpus import webtext
from nltk.probability import FreqDist
from wordcloud import WordCloud
import matplotlib.pyplot as plt

nltk.download('webtext')
wt_words = webtext.words('testing.txt')  # Sample data
data_analysis = nltk.FreqDist(wt_words)

filter_words = dict([(m, n) for m, n in data_analysis.items() if len(m) > 3])

wcloud = WordCloud().generate_from_frequencies(filter_words)

# Plotting the word cloud
plt.imshow(wcloud, interpolation="bilinear")
plt.axis("off")
plt.show()
```