# Scraper program
# Practice using XPath and bs4
# coding: utf-8
import time
import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent #这是一个随机返回一个UA头的模块
from lxml import etree
# Build the HTTP request headers. `ua.random` supplies a randomly chosen
# User-Agent string, so the header is not a single hard-coded browser
# signature.
ua = UserAgent()
headers = {
    # Fixed User-Agent kept for reference as a static alternative:
    # 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
    'User-Agent': ua.random,
}
for i in range(1,2):
time.sleep(1)
#print(‘第’ + str(i) + ‘页’)
url = ‘https://www.xicidaili.com/nn/’ + str(i)
response = requests.get(url=url, headers=headers)
‘’’
res = etree.HTML(response.text)
trs = res.xpath(’//div[@class=“clearfix proxies”]/table[@id=“ip_list”]’)
for tr in trs:
ips = tr.xpath(’./tr/td[2]/text()’) # 一个ip列表
ports = tr.xpath