An extremely simple, beginner-level crawler, intended only as a reference for a few implementation details.
# -*- coding: utf-8 -*-
import urllib.request
import lxml.etree
import gzip
import pymysql
import time
import socket
socket.setdefaulttimeout(20)
conn = pymysql.connect(user='****', passwd='****', host='localhost', port=3306, db='****', charset='utf8')
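# Note (an aside, not part of the original logic): MySQL's `utf8` stores at
# most three bytes per character, so pages containing emoji or other 4-byte
# characters would fail to insert; connecting with charset='utf8mb4' and
# matching table charsets avoids that.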
cur = conn.cursor()
cur.execute("SELECT `name` FROM `img` WHERE `index` = 1")
conn.commit()
t_one=cur.fetchone()[0]
cur.execute("SELECT max(name) FROM `img`")
conn.commit()
t_max=cur.fetchone()[0]
cur.execute("SELECT `ix` FROM `startpage` WHERE `index` = 1")
conn.commit()
startpage=cur.fetchone()[0]
if(int(t_max)>int(t_one)):
t_one=t_max
tt_one=[t_one]
cur.execute("UPDATE `img` SET `name` = %s WHERE `index` = 1",tt_one)
conn.commit()
imgname=int(t_one)
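# A minimal schema sketch consistent with the queries in this script; the
# real table definitions may differ, and all column types here are guesses:
#
#   CREATE TABLE img       (`index` INT, belongpage VARCHAR(16), name VARCHAR(16));
#   CREATE TABLE main      (pagenumber VARCHAR(16), tag VARCHAR(64),
#                           date VARCHAR(32), imgname INT, title VARCHAR(255));
#   CREATE TABLE cont      (belongpage VARCHAR(16), title VARCHAR(255), content TEXT);
#   CREATE TABLE startpage (`index` INT, ix INT);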
def getHTML(url):
    # Fetch a page and return its decoded HTML. Some responses arrive
    # gzip-compressed; detect them by the gzip magic bytes \x1f\x8b.
    # Retry up to four times, then give up and return ''.
    for attempt in range(4):
        try:
            u1 = urllib.request.urlopen(url, timeout=15)
            data = u1.read()
            u1.close()
            if data.startswith(b'\x1f\x8b'):
                return gzip.decompress(data).decode('utf-8')
            return data.decode('utf-8')
        except Exception:
            continue
    return ''
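# A variant that asks the server for gzip explicitly, so the decompression
# branch above is exercised deliberately instead of depending on server
# defaults; a sketch of the standard urllib pattern, not used by this script:
#
#   req = urllib.request.Request(url, headers={'Accept-Encoding': 'gzip'})
#   data = urllib.request.urlopen(req, timeout=15).read()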
def writeDB(c, s, text, pagenumber, title, sign):
    # Extract the text of every node matched by the XPath pattern `text`
    # and store each non-empty piece in the `cont` table; `sign` only
    # labels the pattern in the log output.
    print("Pattern: " + sign)
    i = 0
    try:
        for each in c:
            t = s.xpath(text)[i]
            stmp = t.xpath('string(.)').strip()
            stmp = stmp.replace('\n', '').replace(' ', '')
            i += 1
            # Skip empty pieces and pieces that are just links ("链接"
            # means "link" on the scraped pages, so the filter string
            # must stay in Chinese).
            if stmp != '' and ("链接" not in stmp):
                arg = [pagenumber, title[0], stmp]
                cur.execute("INSERT INTO cont(belongpage,title,content) VALUES(%s,%s,%s)", arg)
                print(stmp)
                print('\n')
        conn.commit()
        print('|' * 47 + "Scrape complete" + '|' * 46)
    except Exception as e:
        print('Error: ' + str(e))
def getContent(url):
    global imgname
    # The page number is the file part of the article URL.
    pagenumber = url.split('/')[4]
    pagenumber = pagenumber.split('.')[0]
    html = getHTML(url)
    print("Page downloaded")
    # lxml.etree.HTML raises on an empty document, which is exactly what
    # getHTML returns on failure, so keep re-downloading until it parses.
    for ty in range(50):
        try:
            s = lxml.etree.HTML(html)
            break
        except Exception:
            print('Attempt ' + str(ty + 1) + '...')
            html = getHTML(url)
    print("Parsing images")
    # Image URLs can sit under any of these container patterns; collect the
    # src attribute of every <img> beneath each one, in this order.
    img_xpaths = [
        '//*[@id="content"]/div[1]/p',
        '//*[@id="content"]/div[1]/div/img',
        '//*[@id="content"]/div[1]/div/div/img',
        '//*[@id="content"]/div[1]/div/table/tbody/tr/td/img',
        '//*[@id="content"]/div[1]/div/div/div/img',
        '//*[@id="content"]/div[1]/div/a/img',
        '//*[@id="content"]/div[1]/div/p/img',
        '//*[@id="content"]/div[1]/div[2]/div/ul/li/div/img',
        '//*[@id="content"]/div[1]/table/tbody/tr/td/div/img',
        '//*[@id="content"]/div[1]/table/tbody/tr/td/div/p/img',
        '//*[@id="content"]/div[1]/ul/li/div/img',
        '//*[@id="content"]/div[1]/table/tbody/tr/td/p/img',
        '//*[@id="content"]/div[1]/ul/p/a/img',
    ]
    imgs = []
    for xp in img_xpaths:
        imgs += s.xpath(xp + '/descendant-or-self::img/attribute::src')
local="E://SpiderTest//_img//"
print("正在下载图片")
for eachimg in imgs:
imgname+=1
sr=str(imgname)
print("正在下第"+sr+"张图片…:"+eachimg,end='')
for ty in range(50):
try:
urllib.request.urlretrieve(eachimg,local+sr+'.jpg')
break
except:
print('第'+str(ty+1)+'次尝试……')
pass
print("…已完成。")
arg=[pagenumber,sr]
cur.execute("INSERT INTO img(belongpage,name) VALUES(%s,%s)",arg)
conn.commit()
print("图片下载完成")
    # The four content layouts this site uses; each is scraped in turn below.
    c1 = s.xpath('//*[@id="content"]/div[1]/p')
    c2 = s.xpath('//*[@id="content"]/div[1]/div[2]')
    c3 = s.xpath('//*[@id="content"]/div[1]/table/tbody/tr/td/div')
    c4 = s.xpath('//*[@id="content"]/div[1]/table/tbody/tr/td')
    title = s.xpath('//*[@id="content"]/div[1]/h1/a/text()')
    tag = s.xpath('//*[@id="content"]/div[1]/div[1]/a[1]/text()')
    # The date link's position varies; whichever <a> contains "20"
    # (the century prefix of the year) is taken as the date.
    date = s.xpath('//*[@id="content"]/div[1]/div[1]/a[3]/text()')
    if "20" not in date[0]:
        date = s.xpath('//*[@id="content"]/div[1]/div[1]/a[4]/text()')
    if "20" not in date[0]:
        date = s.xpath('//*[@id="content"]/div[1]/div[1]/a[5]/text()')
    arg = [pagenumber, tag[0], date[0], imgname, title[0]]
    cur.execute("INSERT INTO main(pagenumber,tag,date,imgname,title) VALUES(%s,%s,%s,%s,%s)", arg)
    conn.commit()
    print('Title: ' + title[0] + ' ' + 'Tags: ' + tag[0] + ' ' + 'Time: ' + date[0] + ' ' + 'Page: ' + pagenumber + '\n')
    writeDB(c1, s, '//*[@id="content"]/div[1]/p', pagenumber, title, "1")
    writeDB(c2, s, '//*[@id="content"]/div[1]/div[2]', pagenumber, title, "2")
    writeDB(c3, s, '//*[@id="content"]/div[1]/table/tbody/tr/td/div', pagenumber, title, "3")
    writeDB(c4, s, '//*[@id="content"]/div[1]/table/tbody/tr/td', pagenumber, title, "4")
    # Be polite to the server between article pages.
    time.sleep(12)
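# Usage sketch: getContent expects a full article URL whose fifth
# slash-separated segment is the numeric page id; the path below is
# hypothetical and the host is elided as in the original:
#
#   getContent("http://***************/archives/12345.html")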
for k in range(startpage, 6):
    url1 = "http://***************/page/" + str(k)
    html = getHTML(url1)
    for ty in range(50):
        try:
            s = lxml.etree.HTML(html)
            break
        except Exception:
            print('Attempt ' + str(ty + 1) + '...')
            html = getHTML(url1)
    # Every article link on this listing page.
    c = s.xpath('//*[@id="content"]/div/div[2]/h2/a')
    print('-' * 44 + "Reached listing page " + str(k) + '-' * 43)
    # Record progress so an interrupted run resumes from this page.
    cur.execute("UPDATE `startpage` SET `ix` = %s WHERE `index` = 1", [k])
    conn.commit()
    for each in c:
        url2 = "".join(each.xpath('attribute::href'))
        # Only follow links whose host matches the target site (elided).
        if url2.split('/')[2].split('.')[0] == "******":
            t_pagenumber = url2.split('/')[4]
            t_pagenumber = t_pagenumber.split('.')[0]
            # Hard-coded page ids to skip.
            if t_pagenumber == '22766' or t_pagenumber == '22888':
                continue
            # Skip pages that are already in the database.
            cur.execute("SELECT `pagenumber` FROM `main` WHERE `pagenumber` = %s", [t_pagenumber])
            print('=' * 100)
            print("Trying to scrape: " + url2)
            if cur.fetchone() is None:
                getContent(url2)
            else:
                print("Already scraped")
# Persist the final image counter, then shut down cleanly.
arg = [imgname]
cur.execute("UPDATE `img` SET `name` = %s WHERE `index` = 1", arg)
conn.commit()
cur.close()
conn.close()
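# A sturdier variant would guard the crawl so the connection is closed even
# when an iteration raises; a sketch of the pattern, with crawl() standing
# in (hypothetically) for the listing-page loop above:
#
#   try:
#       crawl()
#   finally:
#       cur.close()
#       conn.close()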