文章目录
原文记录内容太多现进行摘录和分类
python3的文本处理
jieba库的使用
pip3 install jieba
统计hamlet.txt文本中高频词的个数
kou@ubuntu:~/python$ cat ClaHamlet.py
#!/usr/bin/env python
# coding=utf-8
# e10.1CalHamlet.py
def getText(path="hamlet.txt"):
    """Read the corpus at *path*, lowercase it, and replace special
    characters with spaces so the text can be split on whitespace.

    `path` defaults to "hamlet.txt" to stay compatible with the
    original no-argument call.
    """
    # `with` guarantees the file handle is closed (the original leaked it).
    with open(path, "r") as f:
        txt = f.read()
    txt = txt.lower()
    for ch in '!"#$%&()*+,-./:;<=>?@[\\]^_‘{|}~':
        txt = txt.replace(ch, " ")  # replace special characters with spaces
    return txt

def countWords(txt, top=10):
    """Return the *top* most frequent words of *txt* as (word, count)
    pairs, most frequent first.

    Slicing (instead of the original `range(10)` indexing) avoids an
    IndexError when the text has fewer than *top* distinct words.
    """
    counts = {}
    for word in txt.split():
        counts[word] = counts.get(word, 0) + 1
    items = list(counts.items())
    items.sort(key=lambda x: x[1], reverse=True)
    return items[:top]

if __name__ == "__main__":
    # Print the ten most frequent words, left-aligned, with counts.
    for word, count in countWords(getText()):
        print("{0:<10}{1:>5}".format(word, count))
统计三国演义人物高频词次数(注:下方粘贴的代码实际仍是上面 Hamlet 词频统计的重复;统计《三国演义》人物应使用 jieba 分词版本)
#!/usr/bin/env python
# coding=utf-8
# e10.1CalHamlet.py
def getText():
    """Load hamlet.txt, lowercase it, and blank out punctuation."""
    raw = open("hamlet.txt", "r").read()
    raw = raw.lower()
    for mark in '!"#$%&()*+,-./:;<=>?@[\\]^_‘{|}~':
        # Replace each special character with a space so split() works.
        raw = raw.replace(mark, " ")
    return raw

hamletTxt = getText()
words = hamletTxt.split()

# Tally how often each word appears.
counts = {}
for w in words:
    counts[w] = counts.get(w, 0) + 1

# Order the (word, count) pairs from most to least frequent.
items = sorted(counts.items(), key=lambda pair: pair[1], reverse=True)

# Show the top ten words with their counts.
for rank in range(10):
    word, count = items[rank]
    print("{0:<10}{1:>5}".format(word, count))
爬虫
学习资源是中国大学 MOOC 的爬虫课程(嵩天老师主讲)。
下面写几个简单的代码!熟悉这几个代码的书写以后基本可以完成需求!
爬取百度首页
import requests

# Fetch the Baidu home page and save it to baidu.txt.
r = requests.get("https://www.baidu.com")
r.encoding = 'utf-8'  # force UTF-8 so Chinese text decodes correctly
# `with` closes the file even on error (the original never closed it),
# and we avoid shadowing the builtin `str` as the original did.
with open("baidu.txt", "w+") as fo:
    fo.write(r.text)
爬取京东某手机页面
import requests

# Fetch a JD product page and print the start of the response body.
# The original used `//` as a comment marker (JavaScript style), which
# Python parses as floor division by an undefined name — a runtime
# NameError; it also called `fo.close()` on a `fo` that was never
# defined in this snippet. Both defects are fixed here.
url = "https://item.jd.com/2967929.html"
try:
    r = requests.get(url)
    r.raise_for_status()  # raises HTTPError for any non-200 status
    r.encoding = r.apparent_encoding  # decode with the detected charset
    print(r.text[:1000])  # only the first 1000 characters
except requests.RequestException:
    # Narrowed from a bare `except:` so programming errors still surface.
    print("False")
BeautifulSoup
使用 requests 进行爬取,再使用 BeautifulSoup 进行处理!拥有一个更好的排版。
# Fetch the JD product page, pretty-print it with BeautifulSoup, and
# save the result to jingdong.md.
# NOTE(review): this snippet relies on `import requests` and
# `from bs4 import BeautifulSoup` appearing earlier in the file.
url = "https://item.jd.com/2967929.html"
with open("jingdong.md", "w") as fo:
    try:
        r = requests.get(url)
        r.encoding = r.apparent_encoding  # decode with the detected charset
        demo = r.text
        soup = BeautifulSoup(demo, "html.parser")
        # Write the pretty-printed HTML exactly once — the original wrote
        # it twice (write() followed by writelines() of the same text).
        fo.write(soup.prettify())
    except Exception:
        # Narrowed from a bare `except:`; still best-effort like the original.
        print("False")
BeautifulSoup爬取百度首页
# Fetch the Baidu home page, pretty-print it with BeautifulSoup, and
# save the result to baidu.md.
# NOTE(review): this snippet relies on `import requests` and
# `from bs4 import BeautifulSoup` appearing earlier in the file.
with open("baidu.md", "w") as fo:
    try:
        r = requests.get("https://www.baidu.com")
        r.encoding = r.apparent_encoding  # decode with the detected charset
        demo = r.text
        soup = BeautifulSoup(demo, "html.parser")
        # Write the pretty-printed HTML exactly once — the original wrote
        # it twice (write() followed by writelines() of the same text).
        fo.write(soup.prettify())
    except Exception:
        # Narrowed from a bare `except:`; still best-effort like the original.
        print("False")