python爬取web漫画网站_python爬取漫画

#-*- coding: utf-8 -*-

importreimporturllibimporturllib2importosimportstatimportitertoolsimportreimportsysimportrequestsimportjsonimporttimeimportsocketimporturlparseimportcsvimportrandomfrom datetime importdatetime, timedeltaimportlxml.htmlfrom zipfile importZipFilefrom StringIO importStringIOfrom downloader importDownloaderfrom bs4 importBeautifulSoupfrom HTMLParser importHTMLParserfrom itertools importproductimportsys

reload(sys)

sys.setdefaultencoding('utf8')

URL= 'http://comic.sfacg.com'picture= 'http://coldpic.sfacg.com'

classCartoon():

url=None

name=Nonedef download(url, user_agent='wswp', num_try=2):

headers= {'User_agent': user_agent}

request= urllib2.Request(url, headers=headers)try:

html=urllib2.urlopen(request).read()excepturllib2.URLError as e:print 'Download error', e.reason

html=Noneif num_try >0:if hasattr(e, 'code') and 500 <= e.code < 600:return download(url, user_agent, num_try - 1)elif e.code == 403:returnNonereturnhtmldefget_section_url(url):

html=download(url)if html ==None:returnNone

soup= BeautifulSoup(html, "html.parser")

results= soup.find_all(name='ul', attrs={'class': 'serialise_list Blue_link2'})

res= r'([\S\s]*?)'links= re.findall(res, str(results),re.S |re.M)returnlinksdefget_section_page(url):

html=download(url)if html ==None:returnNone

soup= BeautifulSoup(html, "html.parser")

results= soup.find_all(name='script', attrs={'type': 'text/javascript'})

tt=len(results)

js= results[tt-1]

mm= js.get('src')if mm ==None:

result= soup.find_all(name='script', attrs={'language': 'javascript'})

js1= result[1]

mm= js1.get('src')

html1= download(URL+mm)

list= html1.split(';')

List=[]for each inlist:if 'picAy[' ineach:

src= each.split('=')

List.append(picture+src[1][2:-1])returnListdefdownload_cartoon(url, cartoon_name,Section,num):

path= "自己定义的路径"+cartoon_nameif notos.path.exists(path):

os.mkdir(path)

path= path + "/"+Sectionif notos.path.exists(path):

os.mkdir(path)

content=requests.get(url).content

with open(path+ '/' + str(num) + '.jpg', 'wb') as f:

f.write(content)print "Downloading cartoon_name" + path + str(num)+ "下载完成"f.close()if __name__ == '__main__':

cartoon_list=[]

html=download(URL)

tree=lxml.html.fromstring(html)

results= tree.cssselect('tr > td > a')for each inresults:

ti= each.get('href')if '/mh/' in ti or '/HTML/' inti:if each.text_content() != "":

cartoon=Cartoon()

cartoon.url= each.get('href')

cartoon.name= each.text_content().replace(' ','')

cartoon_list.append(cartoon)for each incartoon_list:printeach.urlprinteach.name

links=get_section_url(each.url)

links=list(reversed(links))

section=0for link inlinks:

ul= URL +link[0]

List=[]

List=get_section_page(ul)

section= section + 1Section= r'第'+ str(section) + r'章'num= 1

for mm inList:#print mm

download_cartoon(mm,each.name,Section,num)

num= num + 1

print each.name + Section + "下载完成"+str(num-1)+"张"

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值