自制 Python3爬虫

极其简单的初级版爬虫,仅用作部分细节参考。

# -*- coding: utf-8 -*-
import urllib.request
import lxml.etree
import gzip
import json
import pymysql
import types
import time

import socket
socket.setdefaulttimeout(20)  # global fallback timeout for every socket the script opens
# Shared DB connection/cursor used by all functions below (credentials redacted in the post).
conn = pymysql.connect(user='****', passwd='****',host='localhost',port=3306 , db='****',charset='utf8')
cur = conn.cursor()
# Row with `index` = 1 in `img` persists the image-filename counter across runs.
cur.execute("SELECT `name` FROM `img` WHERE `index` = 1")
conn.commit()
t_one=cur.fetchone()[0]
# Highest image name actually stored.  NOTE(review): if `name` is a text column,
# max() compares lexicographically, not numerically -- verify the column type.
cur.execute("SELECT max(name) FROM `img`")
conn.commit()
t_max=cur.fetchone()[0]
# `startpage`.`ix` remembers which listing page the previous run reached (resume point).
cur.execute("SELECT `ix` FROM `startpage` WHERE `index` = 1")
conn.commit()
startpage=cur.fetchone()[0]
# Re-sync the counter row with the real maximum before resuming the crawl.
if(int(t_max)>int(t_one)):
    t_one=t_max
    tt_one=[t_one]
    cur.execute("UPDATE `img` SET `name` = %s WHERE `index` = 1",tt_one)
    conn.commit()
imgname=int(t_one)  # running counter used to name downloaded image files
def getHTML(url):
    """Fetch *url* and return its body decoded as UTF-8 text.

    Retries up to 4 times on any error and returns '' when every attempt
    fails.  Gzip-compressed responses are detected via the 1f 8b magic
    bytes and decompressed transparently.

    Bug fixed: the original retried via recursion, but its ``failed``
    counter was a local reset to 0 on every recursive call (so the
    "give up after 4" branch could never fire), and the recursive
    call's return value was discarded, so retried fetches returned None.
    """
    for _attempt in range(4):
        try:
            u1 = urllib.request.urlopen(url, timeout=15)
            try:
                data = u1.read()
            finally:
                u1.close()  # close the response even if read() raises
            if data.startswith(b'\x1f\x8b'):  # gzip magic number
                return gzip.decompress(data).decode('utf-8')
            return data.decode('utf-8')
        except Exception:
            # network/HTTP/decode error: fall through and retry
            continue
    return ''
def writeDB(c, s, text, pagenumber, title, sign):
    """Insert the text of every node matching *text* into the `cont` table.

    c          -- node list from a previous s.xpath(text) call; only its
                  length drives the loop
    s          -- parsed lxml document
    text       -- xpath expression, re-evaluated once per index
    pagenumber -- value stored in cont.belongpage
    title      -- page title; callers pass the raw xpath result list
    sign       -- label echoed to the console for progress tracing

    Bugs fixed: ``stmp.replace(...)`` returned a new string that the
    original discarded (str is immutable), so newlines/spaces were never
    stripped; and the raw *title* list was bound as a single SQL
    parameter, which pymysql cannot do -- a scalar is stored instead.
    """
    print("语句:"+sign)
    # Normalize the xpath-result list to a scalar for the DB bind.
    title_text = title[0] if isinstance(title, list) and title else title
    try:
        for i, _ in enumerate(c):
            node = s.xpath(text)[i]
            stmp = node.xpath('string(.)').strip()
            stmp = stmp.replace('\n', '').replace(' ', '')
            if stmp != '' and ("链接" not in stmp):
                arg = [pagenumber, title_text, stmp]
                cur.execute("INSERT INTO cont(belongpage,title,content) VALUES(%s,%s,%s)", arg)
                print(stmp)
                print('\n')
        conn.commit()
        print('|'*47+"抓取完成"+'|'*46)
    except Exception:
        # Best-effort: a bad fragment aborts this section, not the crawl.
        print('出错')
def getContent(url):
    """Download one article page and store its images and text in the DB.

    Side effects: advances the global image-name counter *imgname*,
    saves image files under the local directory, and inserts rows into
    the `img`, `main` and (via writeDB) `cont` tables.

    Bugs fixed: the `main` INSERT bound raw xpath result lists
    (tag/date/title) as single SQL parameters, which pymysql cannot do;
    ``date[0]``/``title[0]`` were indexed without guarding against empty
    xpath results; bare ``except`` clauses narrowed to ``Exception``.
    """
    global imgname
    # The article id is the 5th path component, e.g. http://host/xx/12345.html
    pagenumber=url.split('/')[4]
    pagenumber=pagenumber.split('.')[0]
    html=getHTML(url)
    print("已下载到网页")
    # Re-fetch and re-parse up to 50 times when lxml rejects the payload.
    for ty in range(50):
        try:
            s=lxml.etree.HTML(html)
            break
        except Exception:
            print('第'+str(ty+1)+'次尝试……')
            html=getHTML(url)
    print("正在解析图片")
    # The site nests <img> under many different layouts; probe each known
    # container and merge the results.
    imgs=s.xpath('//*[@id="content"]/div[1]/p/descendant-or-self::img/attribute::src')
    imgx=s.xpath('//*[@id="content"]/div[1]/div/img/descendant-or-self::img/attribute::src')
    imgy=s.xpath('//*[@id="content"]/div[1]/div/div/img/descendant-or-self::img/attribute::src')
    imgv=s.xpath('//*[@id="content"]/div[1]/div/a/img/descendant-or-self::img/attribute::src')
    imgu=s.xpath('//*[@id="content"]/div[1]/div/p/img/descendant-or-self::img/attribute::src')
    imgt=s.xpath('//*[@id="content"]/div[1]/div[2]/div/ul/li/div/img/descendant-or-self::img/attribute::src')
    imgp=s.xpath('//*[@id="content"]/div[1]/ul/li/div/img/descendant-or-self::img/attribute::src')
    imgn=s.xpath('//*[@id="content"]/div[1]/ul/p/a/img/descendant-or-self::img/attribute::src')
    imgw=s.xpath('//*[@id="content"]/div[1]/div/div/div/img/descendant-or-self::img/attribute::src')
    imgz=s.xpath('//*[@id="content"]/div[1]/div/table/tbody/tr/td/img/descendant-or-self::img/attribute::src')
    imgo=s.xpath('//*[@id="content"]/div[1]/table/tbody/tr/td/p/img/descendant-or-self::img/attribute::src')
    imgr=s.xpath('//*[@id="content"]/div[1]/table/tbody/tr/td/div/img/descendant-or-self::img/attribute::src')
    imgq=s.xpath('//*[@id="content"]/div[1]/table/tbody/tr/td/div/p/img/descendant-or-self::img/attribute::src')
    imgs=imgs+imgx+imgy+imgz+imgw+imgv+imgu+imgt+imgr+imgq+imgp+imgo+imgn
    local="E://SpiderTest//_img//"
    print("正在下载图片")
    for eachimg in imgs:
        imgname+=1
        sr=str(imgname)
        print("正在下第"+sr+"张图片…:"+eachimg,end='')
        # Each image download is retried up to 50 times.
        for ty in range(50):
            try:
                urllib.request.urlretrieve(eachimg,local+sr+'.jpg')
                break
            except Exception:
                print('第'+str(ty+1)+'次尝试……')
        print("…已完成。")
        arg=[pagenumber,sr]
        cur.execute("INSERT INTO img(belongpage,name) VALUES(%s,%s)",arg)
    conn.commit()
    print("图片下载完成")
    # Candidate containers for the article body text (passed to writeDB).
    c1=s.xpath('//*[@id="content"]/div[1]/p')
    c2=s.xpath('//*[@id="content"]/div[1]/div[2]')
    c3=s.xpath('//*[@id="content"]/div[1]/table/tbody/tr/td/div')
    c4=s.xpath('//*[@id="content"]/div[1]/table/tbody/tr/td')
    title=s.xpath('//*[@id="content"]/div[1]/h1/a/text()')
    tag=s.xpath('//*[@id="content"]/div[1]/div[1]/a[1]/text()')
    date=s.xpath('//*[@id="content"]/div[1]/div[1]/a[3]/text()')
    # The date link position varies (a[3]/a[4]/a[5]); keep probing until a
    # value containing "20" (a year) is found.  Guard against empty lists.
    if not date or "20" not in date[0]:
        date=s.xpath('//*[@id="content"]/div[1]/div[1]/a[4]/text()')
        if not date or "20" not in date[0]:
            date=s.xpath('//*[@id="content"]/div[1]/div[1]/a[5]/text()')
    # xpath returns lists; bind scalars, not the lists themselves.
    title0 = title[0] if title else ''
    tag0 = tag[0] if tag else ''
    date0 = date[0] if date else ''
    arg=[pagenumber,tag0,date0,imgname,title0]
    cur.execute("INSERT INTO main(pagenumber,tag,date,imgname,title) VALUES(%s,%s,%s,%s,%s)",arg)
    conn.commit()
    print('Title: '+title0+' '+'Tags: '+tag0+' '+'Time: '+date0+' '+'Page: '+pagenumber+'\n')
    writeDB(c1,s,'//*[@id="content"]/div[1]/p',pagenumber,title,"1")
    writeDB(c2,s,'//*[@id="content"]/div[1]/div[2]',pagenumber,title,"2")
    writeDB(c3,s,'//*[@id="content"]/div[1]/table/tbody/tr/td/div',pagenumber,title,"3")
    writeDB(c4,s,'//*[@id="content"]/div[1]/table/tbody/tr/td',pagenumber,title,"4")
    # Throttle between pages to be polite to the server.
    time.sleep(12)
# Crawl listing pages from the saved resume point up to page 5.
for k in range(startpage,6):
    url1="http://***************/page/"+str(k)
    html=getHTML(url1)
    # Re-fetch and re-parse up to 50 times when lxml rejects the payload.
    for ty in range(50):
        try:
            s=lxml.etree.HTML(html)
            break
        except:
            print('第'+str(ty+1)+'次尝试……')
            html=getHTML(url1)
            pass
    # Article links on the listing page.
    c=s.xpath('//*[@id="content"]/div/div[2]/h2/a')
    print('-'*44+"已抓取到第"+str(k)+"页"+'-'*43)
    # Persist progress so a crashed run resumes from this listing page.
    sarg=[k]
    cur.execute("UPDATE `startpage` SET `ix` = %s WHERE `index` = 1",sarg)
    conn.commit()
    for each in c:
        url2="".join(each.xpath('attribute::href'))
        # Only follow links whose hostname matches the target site (redacted).
        if url2.split('/')[2].split('.')[0] == "******":
            t_pagenumber=url2.split('/')[4]
            t_pagenumber=t_pagenumber.split('.')[0]
            # Hard-coded page ids to skip -- presumably known-bad pages; no
            # reason is recorded in the post.
            if t_pagenumber == '22766' or t_pagenumber == '22888':
                continue
            tt_pagenumber=[t_pagenumber]
            # Skip articles already recorded in `main` (dedup on pagenumber).
            cur.execute("SELECT `pagenumber` FROM `main` WHERE `pagenumber` = %s",tt_pagenumber)
            conn.commit()
            print('='*100)
            print("尝试抓取: "+url2)
            if cur.fetchone() == None:
                getContent(url2)
            else:
                print("已存在")
# Persist the final image counter and release DB resources.
arg=[imgname]
cur.execute("UPDATE `img` SET `name` = %s WHERE `index` = 1",arg)
conn.commit()
cur.close()
conn.close()


  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值