import re
import time
import xlwt
from urllib import parse
from bs4 import BeautifulSoup
import requests
import random
from lxml import etree

# Fetch a list of proxy IPs from a proxy-listing site
def get_ip_list(url, headers):
    web_data = requests.get(url, headers=headers)
    soup = BeautifulSoup(web_data.text, 'lxml')
    ips = soup.find_all('tr')
    ip_list = []
    for i in range(1, len(ips)):  # row 0 is the table header, skip it
        tds = ips[i].find_all('td')
        ip_list.append(tds[1].text + ':' + tds[2].text)  # join as "ip:port"
    return ip_list
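
# A typical return value, assuming the proxy site puts the IP in the second
# <td> and the port in the third of each row (an assumption about the layout
# of ip.yqie.com, which may change):
#   ['117.69.12.34:9999', '183.146.21.88:8080', ...]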
# Pick a random proxy from the IP list and wrap it as a requests proxies dict
def get_random_ip(ip_list):
    proxy_ip = random.choice(ip_list)  # one "ip:port" entry, chosen at random
    # requests expects a dict such as {'http': 'http://123.123.321.123:808'};
    # the 'http' key matters here, since guba.eastmoney.com is served over
    # plain http and requests only applies a proxy whose key matches the
    # URL scheme.
    proxies = {'http': 'http://' + proxy_ip,
               'https': 'https://' + proxy_ip}
    return proxies
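
# Free proxies die quickly, so it can pay to verify one before crawling.
# A minimal sketch, not part of the original flow: the helper name, the
# test URL, and the 5-second timeout are all assumptions.
def is_proxy_alive(proxies, test_url='http://guba.eastmoney.com', timeout=5):
    try:
        requests.get(test_url, proxies=proxies, timeout=timeout)
        return True
    except requests.RequestException:
        return False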

def main():
    start_time = time.time()
    b = 0  # running row counter for the worksheet
    columns = ['URL', 'Author', 'Post time', 'Title', 'Content',
               'Read count', 'Comment count']
    workbook = xlwt.Workbook(encoding="utf-8")
    worksheet = workbook.add_sheet('My Worksheet')
    for k in range(len(columns)):  # write the header row
        worksheet.write(0, k, columns[k])
    workbook.save('movie.xls')
    num_pages = int(input("Enter the number of pages to crawl: "))
    for i in range(num_pages):
        proxy_url = 'http://ip.yqie.com/proxygaoni/'  # proxy-listing site
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1'}
        ip_list = get_ip_list(proxy_url, headers)  # scrape a fresh IP list from the proxy site
        proxies = get_random_ip(ip_list)  # pick one random proxy from that list
        print(proxies)
        # List page i of the guba forum for stock 002506
        url = "http://guba.eastmoney.com/list,002506_{}.html".format(i)
        headers = {'Referer': "http://guba.eastmoney.com",
                   'User-Agent':
                       'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:80.0) '
                       'Gecko/20100101 Firefox/80.0',
                   }
        response0 = requests.get(url=url, headers=headers, proxies=proxies)
        # Relative URLs of the individual post pages on this list page
        news_comment_urls0 = re.findall(r'/news,002506,\S+html', response0.text)
        html = etree.HTML(response0.text)  # parse the list page
        read_num = html.xpath(
            '//*[@id="articlelistnew"]/div/span[3]/a[contains(@href,"new")]/../../span[1]/text()')  # read counts
        comment_num = html.xpath(
            '//*[@id="articlelistnew"]/div/span[3]/a[contains(@href,"new")]/../../span[2]/text()')  # comment counts
        idx = 0  # index into read_num / comment_num for this list page
        for comment_url0 in news_comment_urls0:
            worksheet.write(b + 1, 5, read_num[idx])
            worksheet.write(b + 1, 6, comment_num[idx])
            idx += 1
            workbook.save('movie.xls')
            b += 1
            list_url = "http://guba.eastmoney.com"
            whole_url0 = parse.urljoin(list_url, comment_url0)  # absolute post URL
            print(whole_url0)
            worksheet.write(b, 0, whole_url0)
            workbook.save('movie.xls')
            response1 = requests.get(whole_url0)
            # Author name(s)
            name = re.findall('<font>(.*?)</font>', response1.text)
            for nam in name:
                print(nam)
                worksheet.write(b, 1, nam)
                workbook.save('movie.xls')
            # Post time, extracted as a YYYY-MM-DD date
            tim = str(re.findall('<div class="zwfbtime">(.*?)</div>', response1.text))
            tim1 = re.findall(r'\d{4}-\d{2}-\d{2}', tim)
            for second in tim1:
                print(second)
                worksheet.write(b, 2, second)
                workbook.save('movie.xls')
            # Page <title>; keep only the post title, dropping the site
            # suffix that starts at "_协"
            title = str(re.findall('<title>(.*?)</title>', response1.text))
            title1 = re.findall(r".*'(.*)_协.*", title)
            for j in title1:
                print(j)
                worksheet.write(b, 3, j)
                workbook.save('movie.xls')
            # Post body; strip leftover markup characters
            content = re.findall('<div class="stockcodec .xeditor">(.*?)</div>',
                                 response1.text, re.DOTALL)
            for block in content:
                m = re.sub('[a-zA-Z<>_=.":/]', '', block)
                print(m)
                worksheet.write(b, 4, m)
                workbook.save('movie.xls')
            print(read_num[idx - 1])
            print(comment_num[idx - 1])
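
        # A brief pause between pages is gentler on the site and lowers the
        # odds of the proxy being blocked; the 1-second figure below is an
        # arbitrary assumption, tune as needed.
        # time.sleep(1)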
    end_time = time.time()
    print(f"Total elapsed time: {end_time - start_time} s")
    print(b)  # total number of posts written

if __name__ == '__main__':
    main()
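
A performance note: workbook.save('movie.xls') rewrites the whole file, and the script above calls it after almost every cell; the intermediate saves only matter if partial results should survive a crash. A try/finally around the page loop keeps that safety with a single save point (a sketch, assuming the same main() structure):

    try:
        ...  # the page loop from main(), with the per-cell workbook.save() calls removed
    finally:
        workbook.save('movie.xls')  # persist everything written, even after an error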
Scraping stock forum (guba) comments