This blog post is for technical discussion only. If anything here infringes on your rights, please contact the author to have it removed.
The goal this time is to scrape basic information about popular Chinese websites. The "yellow pages" being crawled are site directory listings, and a little over thirty thousand records were collected. [Figure: screenshot of the scraped results]
1. Proxy IPs
Because a server stops responding once a single IP sends too many requests, the author rotates through proxy IPs instead. The code is as follows:
# Scrape a list of proxy IPs from a public proxy-listing page
def get_ip_list(url_now, headers):
    web_data = requests.get(url_now, headers=headers, timeout=10)  # timeout is in seconds
    soup = BeautifulSoup(web_data.text, 'lxml')
    ips = soup.find_all('tr')
    ip_list = []
    for i in range(1, len(ips)):        # skip the table's header row
        tds = ips[i].find_all('td')
        ip_list.append(tds[1].text + ':' + tds[2].text)  # "ip:port"
    return ip_list
# Pick a random IP from the list and build a proxies dict for requests
def get_random_ip(ip_list):
    proxy_list = []
    for ip in ip_list:
        proxy_list.append('http://' + ip)
    proxy_ip = random.choice(proxy_list)
    # Cover both schemes: with only an 'http' entry, requests would bypass
    # the proxy entirely for https:// targets such as top.chinaz.com
    proxies = {'http': proxy_ip, 'https': proxy_ip}
    return proxies
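Free proxies scraped from public lists are often already dead, so it pays to verify one before trusting it. Below is a minimal sketch of such a check; the helper name get_working_proxy, the httpbin.org test URL, and the 5-second timeout are illustrative assumptions, not part of the original code.

import random
import requests

def get_working_proxy(ip_list, test_url='http://httpbin.org/ip'):
    # Hypothetical helper: return the first scraped proxy that actually answers
    random.shuffle(ip_list)                   # shuffles the list in place
    for ip in ip_list:
        proxies = {'http': 'http://' + ip, 'https': 'http://' + ip}
        try:
            requests.get(test_url, proxies=proxies, timeout=5)
            return proxies                    # first responsive proxy wins
        except requests.RequestException:
            continue                          # dead proxy, try the next one
    raise RuntimeError('no working proxy in the list')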
2. Data initialization
Initialize some basic data: the shared part of the URL, the headers, the workbook, and row_now, which tracks the current row in the sheet:
from bs4 import BeautifulSoup
import requests, xlwt, time, random

# Initialize shared state: base URL, request headers, and the Excel sheet
def init():
    global url, headers, workbook, table, row_now
    url = 'https://top.chinaz.com/all/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36'
    }
    workbook = xlwt.Workbook(encoding='utf-8')
    table = workbook.add_sheet("name", cell_overwrite_ok=True)
    value = ["网站名称", "网站地址", "网站简介"]
    for i in range(len(value)):
        table.write(0, i, value[i])   # header row at row 0
    row_now = 1   # data starts at row 1; starting at 0 would overwrite the headers
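One thing worth knowing: the legacy .xls format that xlwt produces caps a sheet at 65536 rows, which is still comfortably enough for thirty-odd thousand records. As a quick sanity check of the sheet layout, you can write one dummy row and open the file; the dummy values and the test.xls path below are illustrative choices of mine:

init()
table.write(row_now, 0, '测试站点')     # dummy data row, lands at row 1
table.write(row_now, 1, 'example.com')
table.write(row_now, 2, '测试简介')
workbook.save('test.xls')               # open it: row 0 should still hold the headers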
3. Fetch the current page's data and store it in the table
Fetch the current page, parse the HTML to pull out each site's name, address, and description, and write them into the table:
# Fetch one listing page, parse it, and write each site's info to the sheet
def get_now_page_data(now_page, proxies):
    print(proxies, end='\t')
    req = requests.get(url + now_page, headers=headers, proxies=proxies, timeout=10)
    # requests falls back to ISO-8859-1 when the response declares no charset;
    # in that case, sniff the real encoding from the page content instead
    if req.encoding == 'ISO-8859-1':
        encodings = requests.utils.get_encodings_from_content(req.text)
        if encodings:
            encoding = encodings[0]
        else:
            encoding = req.apparent_encoding
    else:
        encoding = req.encoding
    encode_content = req.content.decode(encoding, 'ignore').encode('utf-8', 'ignore')
    soup = BeautifulSoup(encode_content, "lxml")
    message = soup.find_all('div', class_='CentTxt')
    global row_now
    for data in message:
        table.write(row_now, 0, data.h3.a.text)       # site name
        table.write(row_now, 1, data.h3.span.text)    # site address
        table.write(row_now, 2, data.find('p', class_='RtCInfo').text.split('：')[1])  # text after the "简介：" label
        row_now = row_now + 1
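To exercise the parser on its own before involving proxies, you can fetch just the first page over a direct connection (requests treats proxies=None as no proxy) and count what was written; the parse_test.xls path is an illustrative choice:

init()
get_now_page_data('index.html', None)   # direct connection, no proxy
print('rows written:', row_now - 1)     # row 0 is the header row, so subtract it
workbook.save('parse_test.xls')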
4. Main function
The main function loops over the pages and saves the table as an Excel file. Since even a proxy IP stops getting responses once it sends requests too frequently, the author switches to a fresh proxy every ten pages:
# Main entry point: crawl the pages, rotating the proxy every ten of them
if __name__ == '__main__':
    init()
    ip_list = get_ip_list('http://www.xicidaili.com/nn/', headers=headers)
    proxies = get_random_ip(ip_list)
    print("开始第1页", end='\t')
    get_now_page_data('index.html', proxies)
    print("第1页结束")
    for now_page in range(1, 1001):
        print("开始第" + str(now_page + 1) + "页", end='\t')
        if now_page % 10 == 0:
            # refresh the proxy list and pick a new proxy every ten pages
            ip_list = get_ip_list('http://www.xicidaili.com/nn/', headers=headers)
            proxies = get_random_ip(ip_list)
        get_now_page_data('index_' + str(now_page) + '.html', proxies)
        print("第" + str(now_page + 1) + "页结束")
        time.sleep(1)   # brief pause between pages to go easier on the server
    workbook.save('./常见域名爬虫/常见域名列表.xls')
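In practice, free proxies time out or refuse connections constantly, and as written a single exception would abort the whole crawl with the workbook unsaved. Below is a minimal sketch of a retry wrapper built on the functions above; the name fetch_with_retry and the three-attempt limit are assumptions of mine. Saving the workbook each time the proxy rotates would be a similarly cheap safeguard.

def fetch_with_retry(page, ip_list, attempts=3):
    # Hypothetical wrapper: retry a page with a fresh random proxy after each failure
    for _ in range(attempts):
        try:
            get_now_page_data(page, get_random_ip(ip_list))
            return True
        except requests.RequestException:
            continue          # timeout or refused connection: rotate the proxy
    return False              # give up on this page rather than crash the run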