最容易让新手看懂、上手的语言 Python,加上最强大的爬虫模块 requests 和最便捷的节点提取方式 XPath。
from retrying import retry from lxml import etree import requests import re import os class Spider(object): def __init__(self): self.headers = { '''模拟浏览器,防反爬,同理可以加上refer与cookie''' "User_Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36" } '''尝试五次''' @retry(stop_max_attempt_number=5) def _parse_url(self, url): try: response = requests.get(url, headers=self.headers).content.decode() except: return "" else: