<pre name="code" class="python"># -*- coding: utf-8 -*-
import re
import urllib.request
from bs4 import BeautifulSoup
def qiubai():
    for page in range(1, 10):
        # Pretend to be a browser so the site does not reject the request
        user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        headers = {'User-Agent': user_agent}
        url = 'http://www.qiushibaike.com/hot/page/' + str(page)
        request = urllib.request.Request(url, headers=headers)
        response = urllib.request.urlopen(request)
        d = response.read()
        soup = BeautifulSoup(d, 'html.parser')  # name a parser explicitly
        # Every joke on the page sits in a <div class="content"> element
        s = soup.find_all('div', {"class": "content"})
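        # find_all in miniature, on a toy HTML string rather than the real page:
        #   BeautifulSoup('<div class="content">hi</div>', 'html.parser') \
        #       .find_all('div', {"class": "content"})  -> [<div class="content">hi</div>]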
        # Stringify the matched tags and join them so one regex pass can walk all of them
        st = ''.join(str(tag) for tag in s)
        con = r'<div class="content">(.*?)<!'
        mi = re.findall(con, st, re.S)  # re.S lets '.' match newlines; by default it will not
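        # Quick aside on re.S, using a throwaway string (not scraper data):
        #   re.findall(r'a(.*?)b', 'a\nxb')        -> []        '.' stops at the newline
        #   re.findall(r'a(.*?)b', 'a\nxb', re.S)  -> ['\nx']   re.S lets '.' cross it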
        print(mi)
        # On Windows a newly created file defaults to the GBK codec, so Python
        # would try to encode our already-decoded Unicode text as GBK and fail
        # on any character GBK cannot represent. Passing encoding='utf-8' sets
        # the target file's encoding explicitly and avoids that error.
        with open('e:/pythontest/test' + str(page) + '.txt', 'w', encoding='utf-8') as ni:
            for item in mi:
                ni.write(item)
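# The GBK pitfall in isolation (throwaway demo, not part of the scraper):
#   '\U0001F600'.encode('gbk')    # UnicodeEncodeError: 'gbk' codec can't encode character
#   '\U0001F600'.encode('utf-8')  # b'\xf0\x9f\x98\x80' -- UTF-8 handles it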
qiubai()
print('ok')
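The regex step works but is brittle: it depends on the literal "<!" that happens to follow each content div in the page source. A minimal sketch of an alternative, assuming the same <div class="content"> layout, that uses BeautifulSoup's standard get_text() method to strip the markup instead of a hand-written pattern:

# Alternative sketch: pull the joke text straight from the tags, no regex.
from bs4 import BeautifulSoup

def extract_jokes(html):
    soup = BeautifulSoup(html, 'html.parser')
    # get_text(strip=True) returns the tag's text with surrounding whitespace removed
    return [tag.get_text(strip=True)
            for tag in soup.find_all('div', {"class": "content"})]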