bs4的使用
要求:
要爬取的网址:http://manhua.dmzj.com/update_1.shtml
需要Beautiful Soup库
需要urllib2库
- 使用Beautiful soup
from bs4 import BeautifulSoup
- 爬取网页的内容
newurl='http://manhua.dmzj.com/update_%d.shtml'
request=urllib2.Request(newurl)
response=urllib2.urlopen(request)
content=response.read()
- 使用find_all语句
get_text是按文本格式输出:
newurl='http://manhua.dmzj.com/update_%d.shtml'
request=urllib2.Request(newurl)
response=urllib2.urlopen(request)
content=response.read()
soup=BeautifulSoup(content)
text=soup.find_all('div',class_='boxdiv1')
for k in text:
    a=k.get_text()
    print a
- 查找<a>标签里面是否有链接和图片网址
for link in k.find_all("a"):
    if 'href' in link.attrs:
        url="url:"+"http://manhua.dmzj.com"+link.attrs['href']
        print url
for link in k.find_all("img"):
    if 'src' in link.attrs:
        img="img:"+link.attrs['src']
        print img
- 整体输出并写入文本文档
list=a+url+"\n"+img
print list
f1 = open('03.txt','a')
f1.write(list.encode('utf-8'))
f1.close()
- 实现爬取多页
for i in range(1,7):
    newurl='http://manhua.dmzj.com/update_%d.shtml'%i
整体的代码:
__author__ = 'kkk'
#--*--coding:utf-8--*--
from bs4 import BeautifulSoup
import urllib2
url='http://manhua.dmzj.com/update_1.shtml'
for i in range(1,7):
newurl='http://manhua.dmzj.com/update_%d.shtml'%i
request=urllib2.Request(newurl)
response=urllib2.urlopen(request)
content =response.read()
soup=BeautifulSoup(content)
text=soup.find_all('div',class_='boxdiv1')
for k in text:
a=(k.get_text())
print a
for link in k.find_all("a"):
if 'href' in link.attrs:
url="url:"+"http://manhua.dmzj.com"+link.attrs['href']
print url
for link in k.find_all("img"):
if 'src' in link.attrs:
img="img:"+link.attrs['src']
print img
list=a+url+"\n"+img
print list
f1 = open('03.txt','a')
f1.write(list.encode('utf-8'))
f1.close()