前期准备
先把所需要的爬虫的工具包准备好 File -->> Settings -->> Project Interpreter //导入所需要的包,设置国内镜像
from bs4 import BeautifulSoup
import re
import urllib. request, urllib. error
import xlwt
from lxml import etree
import csv
爬取网页
分析目标网址的规律,例如体育彩票的大乐透开奖网址,每次往下翻页时只是 historykj_ 后面的数字在变化 -'http://www.lottery.gov.cn/historykj/history_%s.jspx?_ltype=dlt'%i
定义爬取网页的函数,head 伪装自己不让服务器发现你在爬取
def askURL(url):
    """Fetch *url* and return the decoded page body as a string.

    A browser User-Agent header is sent so the server does not reject
    the request as an obvious crawler.  On a URLError the error code
    and/or reason are printed and an empty string is returned.

    Args:
        url: the page URL to download.

    Returns:
        The page HTML decoded as GBK, or "" if the request failed.
    """
    head = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36"
    }
    request = urllib.request.Request(url, headers=head)
    html = ""
    try:
        # Use a context manager so the connection is always closed
        # (the original leaked the response object).
        with urllib.request.urlopen(request) as response:
            html = response.read().decode('GBK')
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html
解析网页
分析所取目标数据的存放位置,之后利用工具包里的方法对我们所需要的数据进行提取
def getData(pages=100):
    """Scrape the lottery (大乐透) draw-history pages.

    Each row of the result table yields one record: the draw number
    (td[1]) followed by the five red numbers (td[2]..td[6]) and the
    two blue numbers (td[7]..td[8]).

    Args:
        pages: how many history pages to fetch.  Defaults to 100,
            the value the original code hard-coded.

    Returns:
        A list of records; each record is the list of text fragments
        for one draw's eight table cells.
    """
    datalist = []
    for i in range(1, pages + 1):
        url = 'http://www.lottery.gov.cn/historykj/history_%s.jspx?_ltype=dlt' % i
        html = askURL(url)
        tree = etree.HTML(html)
        # Iterate the <tr> elements directly and query each cell with a
        # relative XPath, instead of re-scanning the whole document with
        # an absolute XPath for every cell of every row.
        for row in tree.xpath('//div[@class="result"]/table/tbody/tr'):
            data = []
            for col in range(1, 9):
                data += row.xpath('td[%d]//text()' % col)
            datalist.append(data)
    return datalist
上面的方法适合用于存储在表格中的数据,下面的例子主要说明一个div 封装一个整体的对象,例如影片信息
# Pre-compiled patterns for extracting the Douban movie fields from one
# serialized <div class="item"> fragment.
findlink = re.compile(r'<a href="(.*?)">')                  # detail-page URL
findImag = re.compile(r'<img.*src="(.*?)"', re.S)           # poster URL (re.S: attribute may span lines)
findTitle = re.compile(r'<span class="title">(.*)</span>')  # Chinese / foreign titles
findRating = re.compile(r'<span class="rating_num" property="v:average">(.*)</span>')  # score
findJudge = re.compile(r'<span>(\d*)人评价</span>')          # number of raters
findInq = re.compile(r'<span class="inq">(.*)</span>')      # one-line summary
findBd = re.compile(r'<p class="">(.*?)</p>', re.S)         # misc info paragraph
def getData(baseurl):
    """Scrape 10 pages (250 movies) of the Douban Top-250 list.

    Args:
        baseurl: page URL prefix; the 0-based item offset
            (0, 25, ..., 225) is appended for each page.

    Returns:
        A list of 8-element records:
        [link, image URL, Chinese title, foreign title, rating,
         rating count, one-line summary, misc info].
    """
    datalist = []
    for i in range(0, 10):
        url = baseurl + str(i * 25)
        html = askURL(url)
        soup = BeautifulSoup(html, "html.parser")
        # Each movie is wrapped in one <div class="item">; the regexes
        # are run against its serialized HTML.
        for item in soup.find_all('div', class_="item"):
            data = []
            item = str(item)
            link = re.findall(findlink, item)[0]
            data.append(link)
            imgSrc = re.findall(findImag, item)[0]
            data.append(imgSrc)
            titles = re.findall(findTitle, item)
            if len(titles) == 2:
                # Both a Chinese and a foreign title are present.
                ctitle = titles[0]
                data.append(ctitle)
                otitle = titles[1].replace("/", "")  # drop the "/" separator
                data.append(otitle)
            else:
                data.append(titles[0])
                data.append(' ')  # pad so every record has 8 columns
            rating = re.findall(findRating, item)[0]
            data.append(rating)
            judge = re.findall(findJudge, item)[0]
            data.append(judge)
            inq = re.findall(findInq, item)
            if len(inq) != 0:
                inq = inq[0].replace("。", "")
                data.append(inq)
            else:
                data.append(" ")
            bd = re.findall(findBd, item)[0]
            # Raw strings: '\s' inside a plain string literal is an
            # invalid escape (SyntaxWarning on modern Python).
            bd = re.sub(r'<br(\s+)?/>(\s+)?', " ", bd)
            bd = re.sub(r'/', " ", bd)
            data.append(bd.strip())
            datalist.append(data)
    return datalist
保存数据
def saveDate(datalist, savepath):
    """Write the scraped movie records to an .xls workbook.

    Args:
        datalist: list of 8-element records as produced by getData().
        savepath: destination file path for the workbook.
    """
    book = xlwt.Workbook(encoding="utf-8", style_compression=0)
    sheet = book.add_sheet('豆瓣top250', cell_overwrite_ok=True)
    col = ("电影链接", "图片链接", "影片中文名", "影片国外名", "评分", "评价人数", "概况", "相关信息")
    for i in range(len(col)):
        sheet.write(0, i, col[i])
    # Iterate what we actually have instead of a hard-coded 250 rows,
    # so shorter result lists no longer raise IndexError.
    for i, data in enumerate(datalist):
        print("第%d条" % (i + 1))
        for j in range(len(col)):
            sheet.write(i + 1, j, data[j])
    book.save(savepath)
最后你就会欣然地得到你所想要的数据啦,在数据解析的时候比较难处理,需要花费些功夫学习正则表达式以及那些工具包。