# python3学习爬虫 正则以及url — Python 3 scraping practice: regular expressions and URLs

#coding=utf8
__author__ = 'Administrator'

import os
import re
import urllib.request
import pymysql

class Spider:
    """Simple page scraper: fetches a URL, extracts regex matches from the
    HTML, and saves results (matched text / images) under a local directory.

    Attributes:
        url:    page URL to fetch.
        retext: regex pattern applied to the decoded HTML.
        path:   output directory name (created on demand).
    """

    def __init__(self, url, retext, path):
        self.url = url
        self.path = path
        self.retext = retext

    def mkdir(self, path):
        """Create *path* (including parents) if it does not exist; return *path*."""
        if not os.path.exists(path):
            os.makedirs(path)
        return path

    def getData(self):
        """Fetch ``self.url`` and return every match of ``self.retext``.

        The response body is decoded as UTF-8 with errors ignored, since
        scraped pages may contain stray bytes.
        """
        request = urllib.request.Request(self.url)
        html = urllib.request.urlopen(request).read()
        html = html.decode('utf-8', 'ignore')
        return re.findall(self.retext, html)

    def saveImg(self, imgurl, imgname):
        """Download *imgurl* and save it as ``./<path>/<imgname>.jpg``.

        Best-effort: on any download error the exception is printed and the
        image is skipped (matches the original's behavior).
        """
        try:
            response = urllib.request.urlopen(imgurl)
        except Exception as e:
            print(e)
        else:
            # Context manager guarantees the file handle is closed even if
            # the write fails (original leaked the handle on error).
            with open("./%s/%s.jpg" % (self.path, imgname), 'wb') as f:
                f.write(response.read())

    def saveMysql(self, title, url, catogary, content):
        """Insert one row into the ``pic`` table.

        Bug fixes vs. the original:
        - the VALUES clause had 3 placeholders for 4 values (TypeError);
        - the query was built with %-formatting (SQL injection) — now
          parameterized via cursor.execute's args;
        - ``con.commit()`` was never called, so the insert was rolled back.
        Errors are caught and printed, as before.
        """
        try:
            con = pymysql.connect(host='qdm***w.com', user='q****46',
                                  passwd='*******', db='qd*****db',
                                  port=3306, charset='utf8')
            try:
                cur = con.cursor()
                cur.execute(
                    "insert into pic(title,url,catogary,content) values (%s,%s,%s,%s)",
                    (title, url, catogary, content))
                con.commit()  # required: pymysql does not autocommit by default
                cur.close()
            finally:
                con.close()  # release the connection even if the insert fails
        except Exception as e:
            print("发生异常:%s" % e)

    def getContent(self):
        """Driver: ensure the output dir exists, fetch matches, and append
        each match to ``./<path>/url.txt`` (also echoed to stdout)."""
        path = self.mkdir(self.path)
        data = self.getData()
        x = 0
        with open('./%s/url.txt' % path, 'w+') as fp:
            for d in data:
                print(d)
                fp.write(d)
                if len(d) < 80:
                    # Image download is disabled in the original
                    # (self.saveImg(d, x) was commented out); x still counts
                    # the matches that would have been saved.
                    x += 1


# Sample targets kept from the original script.
url = "http://www.zhihu.com/question/29649162"
url2 = 'http://image.baidu.com/activity/starfans/4093640704%201415350495?&albumtype=1'
# Matches direct image links (jpg/png).
retext = r'http://.*?\.jpg|http://.*?\.png'

# Captures question/answer titles from Zhihu's h2 heading markup.
retext2 = r'<h2 class="zm-item-title.*?>(.*?)</h2>'

if __name__ == "__main__":
    # Guarded so importing this module no longer triggers network I/O
    # (the original ran the spider at import time).
    spider = Spider(url, retext2, "赵丽颖")
    spider.getContent()

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值