- Source code is below; create the folder D:\pic first:
```python
import re
import urllib.request
import requests
from lxml import etree
import pandas as pd
from pandas import DataFrame

headers = {
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'zh-CN,zh;q=0.9',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'
}

# Fetch the search-result page and keep a raw copy of the response body.
r = requests.get("https://home.meishichina.com/search/可乐鸡翅/page/1/", headers=headers)
print(r.content)
with open('one.txt', 'ab') as f:
    f.write(r.content + b'\n')

# Parse the same page and pull the fields we need out with XPath.
url = "https://home.meishichina.com/search/可乐鸡翅/page/1/"
res = requests.get(url, headers=headers)
res.encoding = 'utf-8'
root = etree.HTML(res.text)

# Ingredients / description text of each recipe.
name = root.xpath('//li/div[@class="detail"]/p/text()')
for i in range(len(name)):
    name[i] = re.sub(r'\s', '', name[i])
    print(name[i])

# Author (shop) names.
shopname = root.xpath('//li/div[@class="detail"]//div[@class="substatus clear"]//div[@class="left"]/a/text()')
for i1 in range(len(shopname)):
    shopname[i1] = re.sub(r'\s', '', shopname[i1])
    print(shopname[i1])

# Recipe titles.
productname = root.xpath('//li/div[@class="detail"]/h4/a/em/text()')
for i2 in range(len(productname)):
    productname[i2] = re.sub(r'\s', '', productname[i2])
    print(productname[i2])

# Cover images: download each one into D:\pic and remember its local path.
pic = root.xpath('//li/div[@class="pic"]/a/img/@data-src')
picname = []
for i3 in range(len(pic)):
    pic[i3] = re.sub(r'\s', '', pic[i3])
    print(pic[i3])
    picname.append('D:/pic/' + str(i3) + '.jpg')
    with open('D:/pic/' + str(i3) + '.jpg', 'wb') as writer:
        req = urllib.request.urlopen("https:" + pic[i3])
        writer.write(req.read())
```
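The image writes above assume the D:\pic folder already exists. A minimal sketch that creates it from the script instead of by hand (same path as above, otherwise hypothetical):

```python
import os

# Create D:\pic if it is not there yet; exist_ok keeps re-runs from failing.
os.makedirs(r'D:\pic', exist_ok=True)
```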
- Scheduled crawling, with the results saved to a database
```python
import re
import urllib.request
import requests
from lxml import etree
import pandas as pd
from pandas import DataFrame
import Test_05.connect as co
import datetime
import time


def doSth():
    print('The crawler is starting a scheduled run')
    headers = {
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'zh-CN,zh;q=0.9',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'
    }

    # Fetch the page, keep a raw copy of the body, then parse it with XPath.
    r = requests.get("https://home.meishichina.com/search/可乐鸡翅/page/1/", headers=headers)
    print(r.content)
    with open('one.txt', 'ab') as f:
        f.write(r.content + b'\n')

    url = "https://home.meishichina.com/search/可乐鸡翅/page/1/"
    res = requests.get(url, headers=headers)
    res.encoding = 'utf-8'
    root = etree.HTML(res.text)

    # Ingredients / description, author name, and recipe title for each result.
    name = root.xpath('//li/div[@class="detail"]/p/text()')
    for i in range(len(name)):
        name[i] = re.sub(r'\s', '', name[i])
        print(name[i])
    shopname = root.xpath('//li/div[@class="detail"]//div[@class="substatus clear"]//div[@class="left"]/a/text()')
    for i1 in range(len(shopname)):
        shopname[i1] = re.sub(r'\s', '', shopname[i1])
        print(shopname[i1])
    productname = root.xpath('//li/div[@class="detail"]/h4/a/em/text()')
    for i2 in range(len(productname)):
        productname[i2] = re.sub(r'\s', '', productname[i2])
        print(productname[i2])

    # Download the cover images into D:\pic and remember the local paths.
    pic = root.xpath('//li/div[@class="pic"]/a/img/@data-src')
    picname = []
    for i3 in range(len(pic)):
        pic[i3] = re.sub(r'\s', '', pic[i3])
        print(pic[i3])
        picname.append('D:/pic/' + str(i3) + '.jpg')
        with open('D:/pic/' + str(i3) + '.jpg', 'wb') as writer:
            req = urllib.request.urlopen("https:" + pic[i3])
            writer.write(req.read())

    # Write one row per recipe into the product table.
    conn, cur = co.conn_db()
    for i4 in range(len(name) - 1):
        if productname[i4] is not None and name[i4] is not None and picname[i4] is not None and shopname[i4] is not None:
            sta = co.exe_update(cur, "insert into product(name, yuanliao, pic, merchants) "
                                     "values('%s','%s','%s','%s')" % (productname[i4], name[i4], picname[i4], shopname[i4]))
            if sta == 1:
                print('Insert succeeded')
            else:
                print('Insert failed')
            co.exe_commit(cur)
        else:
            break
    co.conn_close(conn, cur)


def main(h=21, m=7):
    # Check once a minute; when the clock reaches h:m, run the crawl.
    while True:
        now = datetime.datetime.now()
        print(now)
        if now.hour == h and now.minute == m:
            doSth()
        time.sleep(60)


main()
```
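The `Test_05.connect` module that `doSth()` relies on is not shown in the post. A minimal sketch of what its four helpers could look like, assuming a MySQL backend reached through `pymysql` and a `product` table with `name`, `yuanliao`, `pic` and `merchants` columns; the connection parameters are placeholders, not values from the original:

```python
# Hypothetical Test_05/connect.py -- the real module is not shown in the post.
import pymysql


def conn_db():
    # Open a connection and hand back (connection, cursor), as doSth() expects.
    conn = pymysql.connect(host='localhost', user='root', password='******',
                           database='test', charset='utf8mb4')
    return conn, conn.cursor()


def exe_update(cur, sql):
    # Run an INSERT/UPDATE statement; 1 signals success, 0 signals failure.
    try:
        cur.execute(sql)
        return 1
    except pymysql.MySQLError as err:
        print(err)
        return 0


def exe_commit(cur):
    # Commit through the cursor's connection, matching the cur-only call site.
    cur.connection.commit()


def conn_close(conn, cur):
    cur.close()
    conn.close()
```

Whatever the real helpers look like, note that building the INSERT with `%` string formatting breaks as soon as the scraped text contains a quote; passing the values separately via `cur.execute(sql, params)` is the safer pattern.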