爬取的信息包括:公司名称、产品URL、产品名称、产品类别、产品详细信息
1. 翻页模板
import os
from lxml import etree
from baseUtils import *
from hashlib import md5
import datetime
# Company name is derived from this script's filename (one scraper per company).
_companyName = os.path.basename(__file__).replace(".py", "")
# Crawl start timestamp, stamped into every record written this run.
timeNow = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
# Accumulates (md5_id, record) tuples; flushed to Elasticsearch in start().
_result = []
def getGoodsUrls(max_page=4):
    """Collect product detail-page URLs from the paginated listing.

    Args:
        max_page: last listing page to fetch (pages 1..max_page).
                  Defaults to 4, matching the original hard-coded range.

    Returns:
        list[str]: absolute product URLs, deduplicated per listing page.
    """
    goodsUrls = []
    pages = [f"http://www.ahhtdq.com/info/30816.html?page={p}"
             for p in range(1, max_page + 1)]
    for page in pages:
        response = request(page)
        root = etree.HTML(response.text)
        # set() drops duplicate hrefs within a single listing page
        goodsUrls += ["http://www.ahhtdq.com" + href for href in
                      set(root.xpath("//li[@class='iFCB-PRO-5 clearfix clone']//a/@href"))]
    return goodsUrls
def saveContent(_url):
    """Fetch one product detail page, parse it, and buffer the record in _result."""
    response = request(_url)
    root = etree.HTML(response.text)

    def _clean(text):
        # strip CR/LF/TAB noise that pollutes scraped text nodes
        return text.replace("\r", "").replace("\n", "").replace("\t", "")

    _name = _clean(getContent(root.xpath("//div[@class='display_title']/h1/text()")))
    _class = _clean(getContent(root.xpath("//span[@class='ico']/a[3]/text()")))
    # repr() turns control characters into literal escapes, which are then removed
    raw = repr("".join(root.xpath("//div[@id='info_content']//text()")))
    detailInfo = (raw.replace("\\r", "").replace("\\t", "").replace(" ", "")
                  .replace("\\n", "").replace("\\xa0", "").replace("\'", "")
                  .replace("\\u3000", "").strip())
    record = {
        "goodsUrl": _url,
        "campany_name": _companyName,
        "category": _class,
        "goodsName": _name,
        "detailInfo": detailInfo,
        "System_update_time": timeNow,
    }
    print(record)
    _result.append((md5(_url.encode()).hexdigest(), record))
def start():
    """Crawl all product pages with a 5-thread pool, then flush to Elasticsearch."""
    goodsUrls = getGoodsUrls()
    pool = threadpool.ThreadPool(5)
    # plain loop instead of a throwaway list comprehension used only for side effects
    for task in threadpool.makeRequests(saveContent, goodsUrls):
        pool.putRequest(task)
    pool.wait()
    write2ES(_result)
if __name__ == '__main__':
    start()
翻页模板是最容易爬取的一种情况
2. 菜单模板
import json
import re
from lxml import etree
from baseUtils import *
from hashlib import md5
import datetime
import os
# Company name is derived from this script's filename (one scraper per company).
_companyName = os.path.basename(__file__).replace(".py","")
# Crawl start timestamp, stamped into every record written this run.
timeNow = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
# Accumulates (md5_id, record) tuples; flushed to Elasticsearch in start().
_result = []
def getGoodsUrls():
    """Walk the product menu, page through each category via the site's
    AJAX list.do endpoint, and collect product URLs.

    Returns:
        list[list[str]]: [category_name, absolute_product_url] pairs.
    """
    goodsUrls = []
    index = "http://www.msensor.com.cn/Products111.html"
    response = request(index)
    root = etree.HTML(response.text)
    # category names and links come from the third top-nav menu entry
    names = ["".join(t).strip() for t in root.xpath("//ul[@class='navUl']/li[3]//li//a/text()")]
    urls = ["http://www.msensor.com.cn" + h for h in root.xpath("//ul[@class='navUl']/li[3]//li//a/@href")]
    menu = dict(zip(names, urls))
    # iterate items() directly instead of shadowing the name list with the loop var
    for category, category_url in menu.items():
        response = request(category_url)
        root = etree.HTML(response.text)
        # page count is rendered in the pageNum div; default to 1 when absent
        pages = [int(p) for p in root.xpath("//div[@class='pageNum']/text()")]
        pages.append(1)
        last_page = max(pages)
        # the AJAX request parameters are embedded in inline JS; raw strings
        # avoid the invalid-escape warnings the original patterns produced
        requestParam = json.loads(getContent(re.findall(r'requestParam:(\{.*?\})', response.text)))
        requestApi = "http://www.msensor.com.cn" + getContent(
            re.findall(r'url: "(/comp/portalResProduct/list\.do\?compId=.*?)"', response.text))
        for page in range(1, last_page + 1):
            requestParam['currentPage'] = page
            response = request(requestApi, params=requestParam)
            root = etree.HTML(response.text)
            # set() drops duplicate hrefs within a single result page
            goodsUrls += [[category, "http://www.msensor.com.cn" + h] for h in
                          set(root.xpath("//div[@class='e_box p_images p_imagesMo']/a/@href"))]
    return goodsUrls
def saveContent(tp):
    """Parse one product detail page and buffer its record in _result.

    tp: two-item sequence [category_name, absolute_product_url].
    """
    category, url = tp[0], tp[1]
    response = request(url)
    root = etree.HTML(response.text)
    name = "".join(root.xpath("//h1[@class='e_title d_Title p_Title h2']//text()")).strip()
    # repr() turns control characters into literal escapes; strip them plus
    # layout junk and the trailing "返回" (back) link text
    detail = repr("".join(root.xpath("//div[@class='e_box d_DescribeContent p_DescribeContent']//p//text()")))
    for junk in ("\\r", "\\t", " ", "\\n", "\\xa0", "\\u3000", "\'", "返回"):
        detail = detail.replace(junk, "")
    detail = detail.strip()
    record = {
        "goodsUrl": url,
        "campany_name": _companyName,
        "category": category,
        "goodsName": name,
        "detailInfo": detail,
        "System_update_time": timeNow,
    }
    # print(record)
    _result.append((md5(url.encode()).hexdigest(), record))
def start():
    """Crawl all categorized product pages with a 5-thread pool, then flush to ES."""
    goodsUrls = getGoodsUrls()
    # print(len(goodsUrls))
    pool = threadpool.ThreadPool(5)
    # plain loop instead of a throwaway list comprehension used only for side effects
    for task in threadpool.makeRequests(saveContent, goodsUrls):
        pool.putRequest(task)
    pool.wait()
    write2ES(_result)
if __name__ == '__main__':
    start()
3. list.do模板
import re
from lxml import etree
from baseUtils import *
from hashlib import md5
import datetime
import os
# Company name is derived from this script's filename (one scraper per company).
_companyName = os.path.basename(__file__).replace(".py","")
# Crawl start timestamp, stamped into every record written this run.
timeNow = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
# Accumulates (md5_id, record) tuples; flushed to Elasticsearch in start().
_result = []
def getGoodsUrls():
    """Collect [category, product-url] pairs for www.jshhby.com.

    Work in progress: only the category menu is scraped so far; the
    per-category pagination below is still commented out while being
    adapted from the msensor template.

    Returns:
        list: currently always empty until the paging code is enabled.
    """
    goodsUrls = []
    index = "http://www.jshhby.com/product/5/"
    response = request(index)
    root = etree.HTML(response.text)
    _class = ["".join(_).strip() for _ in root.xpath("//h3[@class='yjTitle link']/a//text()")]
    _url = ["http://www.jshhby.com" + _ for _ in root.xpath("//h3[@class='yjTitle link']/a/@href")]
    menu = dict(zip(_class, _url))
    print(_class)
    print(_url)
    print(menu)
    # for _class in menu:
    #     response = request(menu[_class])
    #     root = etree.HTML(response.text)
    #     pages = [int(_) for _ in root.xpath("//div[@class='pageNum']/text()")]
    #     pages.append(1)
    #     pages = max(pages)
    #     requestParam = json.loads(getContent(re.findall('requestParam:(\{.*?\})', response.text)))
    #     requestApi = "http://www.jshhby.com" + getContent(
    #         re.findall('url: \"(/comp/portalResProduct/list\.do\?compId=.*?)\"', response.text))
    #     for page in range(1,pages+1):
    #         requestParam['currentPage'] = page
    #         response = request(requestApi, params=requestParam)
    #         root = etree.HTML(response.text)
    #         goodsUrls += [[_class, "http://www.jshhby.com" + _] for _ in
    #                       list(set(root.xpath("//div[@class='e_box p_Product']//a/@href")))]
    # BUG FIX: the original fell off the end and implicitly returned None,
    # which would break any caller iterating the result; always return the
    # (currently empty) list so callers get an iterable.
    return goodsUrls
def saveContent(tp):
    """Parse one product detail page and buffer its record in _result.

    tp: two-item sequence [category_name, absolute_product_url].
    """
    category, url = tp[0], tp[1]
    response = request(url)
    root = etree.HTML(response.text)
    name = "".join(root.xpath("//h1[@class='e_title d_Title p_Title h2']//text()")).strip()
    # repr() turns control characters into literal escapes; strip them plus
    # layout junk and the trailing "返回" (back) link text
    detail = repr("".join(root.xpath("//div[@class='pro_content']//p//text()")))
    for junk in ("\\r", "\\t", " ", "\\n", "\\xa0", "\\u3000", "\'", "返回"):
        detail = detail.replace(junk, "")
    detail = detail.strip()
    record = {
        "goodsUrl": url,
        "campany_name": _companyName,
        "category": category,
        "goodsName": name,
        "detailInfo": detail,
        "System_update_time": timeNow,
    }
    print(record)
    _result.append((md5(url.encode()).hexdigest(), record))
def start():
    """WIP driver: currently only builds the category menu; the concurrent
    crawl and ES write below are still commented out while getGoodsUrls is
    being adapted to this site."""
    goodsUrls = getGoodsUrls()
    # print(len(goodsUrls))
    # NOTE(review): re-enable once getGoodsUrls returns real URL pairs.
    # pool = threadpool.ThreadPool(5)
    # tasks = threadpool.makeRequests(saveContent, goodsUrls)
    # [pool.putRequest(task) for task in tasks]
    # pool.wait()
    # write2ES(_result)
if __name__ == '__main__':
    start()
封装模块baseUtils.py:
import requests
import threadpool
from elasticsearch import Elasticsearch, helpers
from retrying import retry
import urllib3
urllib3.disable_warnings()
# Browser-like default headers so target sites don't reject the crawler.
_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36',
}
# Outbound proxy for all crawler traffic.
# NOTE(review): internal address — confirm it is reachable from where this runs.
_proxies = {
    'http': 'http://192.168.100.210:3128',
    'https': 'http://192.168.100.210:3128',
}
@retry(stop_max_attempt_number=20)
def request(url, method="get", header=None, proxies=None, verify=False, _data=None, params=None):
    """HTTP GET/POST with crawler defaults, retried up to 20 times on any exception.

    Args:
        url: target URL.
        method: "get" or "post".
        header: request headers; defaults to the module's browser-like _headers.
        proxies: proxy mapping; defaults to the module's _proxies.
        verify: TLS certificate verification (off by default for internal proxying).
        _data: POST body (post only).
        params: query-string parameters.

    Returns:
        requests.Response with .encoding set from the sniffed body encoding.

    Raises:
        ValueError: for an unsupported method (the original silently returned None).
    """
    if proxies is None:
        proxies = _proxies
    if header is None:
        header = _headers
    if method == "get":
        response = requests.get(url, headers=header, proxies=proxies, params=params, verify=verify)
    elif method == "post":
        response = requests.post(url, headers=header, proxies=proxies, data=_data, params=params, verify=verify)
    else:
        raise ValueError(f"unsupported HTTP method: {method!r}")
    # sites frequently mislabel their charset; trust the sniffed encoding instead
    response.encoding = response.apparent_encoding
    return response
def getContent(item, default=""):
    """Return the first element of *item*, or *default* when *item* is empty.

    Convenience for xpath()/findall() results, which are lists that may be empty.
    """
    if item:
        return item[0]
    return default
def write2ES(tps):
    """Bulk-upsert crawled records into Elasticsearch in chunks of 1000.

    Args:
        tps: iterable of (doc_id, record) tuples, e.g. from a scraper's _result.
    """
    es_config = Elasticsearch(hosts=['http://192.168.100.230:9200'], http_auth=None, timeout=60)
    Esaction = []
    for _id, record in tps:
        Esaction.append({
            "_op_type": 'update',
            "_index": 'crawler_company_product',
            "_type": 'index',
            "_id": _id,
            # doc_as_upsert inserts the doc when the id doesn't exist yet
            "_source": {
                "doc": record,
                "doc_as_upsert": True
            }
        })
        if len(Esaction) >= 1000:
            helpers.bulk(es_config, Esaction)
            Esaction = []
    # flush the remainder; skip the call when nothing is buffered
    # (the original unconditionally sent a possibly-empty bulk request)
    if Esaction:
        helpers.bulk(es_config, Esaction)