# Python Crawler: Scraping Product Information and Downloading Images


1. Source code. Create the folder `D:\pic` first:

```python
import re
import urllib.request
import requests
from lxml import etree

# Set the request headers
headers = {
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'zh-CN,zh;q=0.9',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'}

r = requests.get("https://home.meishichina.com/search/可乐鸡翅/page/1/", headers=headers)
print(r.content)

# Save the raw page bytes to one.txt (binary append mode, since r.content is bytes)
with open('one.txt', 'ab') as writer:
    writer.write(r.content + b'\n')



url = "https://home.meishichina.com/search/可乐鸡翅/page/1/"
res = requests.get(url, headers=headers)
res.encoding = 'utf-8'
root = etree.HTML(res.text)
# Ingredient/description text for each search result
name = root.xpath('//li/div[@class="detail"]/p/text()')

for i in range(len(name)):
    name[i] = re.sub(r'\s', '', name[i])
    print(name[i])

# Shop name
shopname = root.xpath('//li/div[@class="detail"]//div[@class="substatus clear"]//div[@class="left"]/a/text()')
for i1 in range(len(shopname)):
    shopname[i1] = re.sub(r'\s', '', shopname[i1])
    print(shopname[i1])

# Product name
productname = root.xpath('//li/div[@class="detail"]/h4/a/em/text()')
for i2 in range(len(productname)):
    productname[i2] = re.sub(r'\s', '', productname[i2])
    print(productname[i2])

# Product images: download each one into D:\pic
pic = root.xpath('//li/div[@class="pic"]/a/img/@data-src')
picname = []
for i3 in range(len(pic)):
    pic[i3] = re.sub(r'\s', '', pic[i3])
    print(pic[i3])
    # Record the same path the file is actually saved under
    picname.append('D:/pic/' + str(i3) + '.jpg')
    with open('D:/pic/' + str(i3) + '.jpg', 'wb') as writer:
        req = urllib.request.urlopen('https:' + pic[i3])
        writer.write(req.read())
```
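If the scraped fields are also wanted as a table, pandas can collect them into one structure. A minimal sketch, assuming the four lists produced by the script above; the function name `save_rows` and the CSV column names are illustrative, not from the original:

```python
import pandas as pd

def save_rows(productname, name, shopname, picname, path='products.csv'):
    """Collect the four scraped lists into one table and write it to CSV."""
    # Truncate to the shortest list so a missing field cannot shift columns
    n = min(len(productname), len(name), len(shopname), len(picname))
    df = pd.DataFrame({
        'product': productname[:n],
        'ingredients': name[:n],
        'shop': shopname[:n],
        'pic': picname[:n],
    })
    # utf-8-sig keeps the Chinese text readable when the CSV is opened in Excel
    df.to_csv(path, index=False, encoding='utf-8-sig')
    return df
```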


2. Scheduled crawling, with the results saved to a database:

```python
import re
import urllib.request
import requests
from lxml import etree
import Test_05.connect as co
import datetime
import time


def doSth():
    print('The crawler is about to start crawling like crazy')
    headers = {
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'zh-CN,zh;q=0.9',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'}
    r = requests.get("https://home.meishichina.com/search/可乐鸡翅/page/1/", headers=headers)
    print(r.content)
    # Save the raw page bytes to one.txt (binary append mode, since r.content is bytes)
    with open('one.txt', 'ab') as writer:
        writer.write(r.content + b'\n')

    url = "https://home.meishichina.com/search/可乐鸡翅/page/1/"
    res = requests.get(url, headers=headers)
    res.encoding = 'utf-8'
    root = etree.HTML(res.text)

    # Ingredient/description text for each search result
    name = root.xpath('//li/div[@class="detail"]/p/text()')
    for i in range(len(name)):
        name[i] = re.sub(r'\s', '', name[i])
        print(name[i])

    # Shop name
    shopname = root.xpath('//li/div[@class="detail"]//div[@class="substatus clear"]//div[@class="left"]/a/text()')
    for i1 in range(len(shopname)):
        shopname[i1] = re.sub(r'\s', '', shopname[i1])
        print(shopname[i1])

    # Product name
    productname = root.xpath('//li/div[@class="detail"]/h4/a/em/text()')
    for i2 in range(len(productname)):
        productname[i2] = re.sub(r'\s', '', productname[i2])
        print(productname[i2])

    # Product images: download each one into D:\pic
    pic = root.xpath('//li/div[@class="pic"]/a/img/@data-src')
    picname = []
    for i3 in range(len(pic)):
        pic[i3] = re.sub(r'\s', '', pic[i3])
        print(pic[i3])
        # Record the same path the file is actually saved under
        picname.append('D:/pic/' + str(i3) + '.jpg')
        with open('D:/pic/' + str(i3) + '.jpg', 'wb') as writer:
            req = urllib.request.urlopen('https:' + pic[i3])
            writer.write(req.read())

    # Connect to the database via the helper module
    conn, cur = co.conn_db()

    # Insert only complete rows: iterate over the shortest of the four lists
    rows = min(len(productname), len(name), len(picname), len(shopname))
    for i4 in range(rows):
        sta = co.exe_update(cur, "insert into product(name, yuanliao, pic, merchants) "
                                 "values('%s','%s','%s','%s')"
                                 % (productname[i4], name[i4], picname[i4], shopname[i4]))
        if sta == 1:
            print('Insert succeeded')
        else:
            print('Insert failed')
        co.exe_commit(cur)  # Remember to commit, or the rows never reach the database
    co.conn_close(conn, cur)


def main(h=21, m=7):
    while True:
        now = datetime.datetime.now()
        print(now)
        if now.hour == h and now.minute == m:
            doSth()
        # Check once every 60 seconds
        time.sleep(60)


main()
```
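The script leans on a local helper module, `Test_05.connect`, that the article never shows. A minimal sketch of what it presumably provides, written with pymysql; only the four function signatures are taken from the calls above, while the connection parameters and error handling are assumptions:

```python
# Test_05/connect.py -- hypothetical reconstruction of the helper module.
# Host, credentials, and database name below are placeholders.
import pymysql

def conn_db():
    conn = pymysql.connect(host='localhost', user='root', password='secret',
                           database='spider', charset='utf8mb4')
    return conn, conn.cursor()

def exe_update(cur, sql):
    # Return 1 on success so the caller's `if sta == 1` check works
    try:
        cur.execute(sql)
        return 1
    except pymysql.MySQLError:
        return 0

def exe_commit(cur):
    # The cursor's connection owns the transaction
    cur.connection.commit()

def conn_close(conn, cur):
    cur.close()
    conn.close()
```

Note that `%`-formatting scraped text straight into the SQL string breaks as soon as a value contains a quote; pymysql's parameterized form, `cur.execute(sql, args)`, sidesteps that.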
