# This simple single-threaded crawler uses Ctrip (vacations.ctrip.com) as an
# example and scrapes basic information about popular tour packages.
import re
import time

import requests
from bs4 import BeautifulSoup
from lxml import etree
class XieCheng(object):
    """Single-threaded crawler scraping basic info about popular tour
    packages from Ctrip (vacations.ctrip.com).

    Flow: front page -> hot-destination listing pages -> product detail
    pages, printing one dict of extracted fields per product.
    """

    def __init__(self):
        # Entry page listing hot destinations, plus a browser-like UA so
        # the site is less likely to reject the request outright.
        self.url = 'http://vacations.ctrip.com/'
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"
        }

    def send_request(self, link):
        """Fetch *link* and return the response body as text.

        Bug fix: the original built ``self.headers`` but never passed it,
        so the custom User-Agent was never sent.  A timeout is added so a
        dead server cannot hang the crawl indefinitely.
        """
        response_html = requests.get(url=link, headers=self.headers, timeout=10).text
        # time.sleep(2)  # uncomment to throttle politely between requests
        return response_html

    def parse_place_link(self, ParseHtml):
        """Yield absolute URLs of hot-destination links from the front page.

        Robustness fix: the original indexed ``findall(...)[0]`` unguarded
        (and is called outside any try), so a missing section crashed the
        whole crawl; now it simply yields nothing.
        """
        HtmlObj = re.compile(r' <dt>热门目的地旅游</dt>(.*?)<a target="_blank" href="/tours">更多目的地</a>', re.S)
        sections = HtmlObj.findall(ParseHtml)
        if not sections:
            return
        for link in re.findall(r'href="(.*?)"', sections[0]):
            # hrefs are site-relative ("/tours/..."), so prepend the host
            yield 'http://vacations.ctrip.com' + str(link)

    def parse_detail_page_link(self, ArticleHtml):
        """Yield absolute product-detail URLs from a destination listing page."""
        for partial in re.findall(r'<h2 class="product_title"><a href="(.*?)"', ArticleHtml):
            # hrefs are protocol-relative ("//host/path"); prepend the scheme
            yield "http:" + str(partial)

    def parse_page_info(self, Response_html):
        """Extract feature, itinerary, price and title from a detail page.

        Returns a dict with keys FEATURE/DETAILED/PRICE/TITLE, or ``None``
        when the page cannot be parsed (best-effort: bad pages are skipped,
        not fatal).
        """
        try:
            soup = BeautifulSoup(Response_html, 'lxml')
            if 'product_feature' in Response_html:
                product_feature = soup.select('.product_feature')[0].get_text().replace('\n', '').replace('\t', '')
            else:
                product_feature = ''
            # Keep only CJK characters and digits from the itinerary section.
            detailed = ','.join(re.findall(r'[\u4e00-\u9fa50-9]+', re.findall(r'<!--详细行程Start-->(.*?)<!--详细行程End-->', Response_html, re.S)[0]))
            if "minPrice" in Response_html:
                price = re.findall(r'"minPrice":(.*?),', Response_html)[0] + '元'
            elif 'ProductMinPrice' in Response_html:
                # Bug fix: the original pattern was r'"ProductMinPrice:"(.*?)",'
                # — the colon sat inside the quoted key, so it could never
                # match JSON of the form "ProductMinPrice":"...".
                price = re.findall(r'"ProductMinPrice":"(.*?)",', Response_html)[0] + '元'
            else:
                price = ''
            title = re.findall(r'<h1 itemprop="name">(.*?)<', Response_html, re.S)[0].strip()
            return {
                'FEATURE': product_feature,
                'DETAILED': detailed,
                'PRICE': price,
                'TITLE': title,
            }
        except Exception as error:
            # print('----error----', error)
            return None

    def main(self):
        """Crawl front page -> destination pages -> product pages, printing
        one parsed dict per product."""
        html = self.send_request(self.url)
        for link in self.parse_place_link(html):
            detail_page = self.send_request(link)
            for detail_link in self.parse_detail_page_link(detail_page):
                page_html = self.send_request(detail_link)
                print(self.parse_page_info(page_html))
if __name__ == "__main__":
xiecheng = XieCheng()
xiecheng.main()
# The crawler is simple and the extracted data is flat, but the overall
# fetch -> parse -> follow-links workflow is typical of most crawlers.