Written for a younger student in one of my group chats. The variable names aren't mine, and it's a college assignment, so go easy on it.
Nth update: added fetching of free proxies from Kuaidaili so Douban's anti-scraping doesn't shut the crawler down. The loop now breaks once Douban returns an empty subjects list, so the crawl stops on its own instead of printing proxies forever. That's it for now; if you have questions, scan the QR code to reach me on WeChat Work.
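Free proxies go stale fast, so many failures below are dead proxies rather than Douban blocking you. Here is a minimal sketch for pruning proxy.txt down to live entries before a run; the filter_live_proxies name, the test URL, and the 3-second timeout are my own choices, not part of the assignment:

import requests

def filter_live_proxies(infile='proxy.txt', timeout=3):
    # keep only proxies that can fetch a simple page within the timeout
    with open(infile) as f:
        candidates = [line.strip() for line in f if line.strip()]
    live = []
    for p in candidates:
        try:
            r = requests.get('https://movie.douban.com/',
                             proxies={'http': p, 'https': p}, timeout=timeout)
            if r.status_code == 200:
                live.append(p)
        except requests.RequestException:
            pass  # dead or too slow: drop it
    with open(infile, 'w') as f:
        f.write('\n'.join(live) + '\n')
    return live

Calling something like this between save_data and main() would keep the pool mostly usable.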
The code is below, for reference only (it depends on requests, beautifulsoup4, lxml, and xlwt):
import json
import time

import requests
import xlwt
from bs4 import BeautifulSoup
from lxml import etree
# Fetch the proxy-list page and return its HTML text
def get_data(url):
    headers = {
        'user-agent': 'Mozilla/5.0'
    }
    html = requests.get(url, headers=headers)  # headers must be a keyword argument
    html.encoding = 'utf-8'
    return html.text
# Parse the Kuaidaili free-proxy table into "protocol://ip:port" strings
def parse_data(html):
    soup = BeautifulSoup(html, 'html.parser')
    # protocol, address, and port columns of the table
    protocol = soup.select('#list > table > tbody > tr > td:nth-child(4)')
    ip = soup.select('#list > table > tbody > tr > td:nth-child(1)')
    port = soup.select('#list > table > tbody > tr > td:nth-child(2)')
    data = []  # proxy URLs
    for i in range(len(ip)):  # the three column lists have the same length
        temp = protocol[i].get_text() + '://' + ip[i].get_text() + ':' + port[i].get_text()
        data.append(temp)
    return data
# Append the proxies to proxy.txt, one per line
def save_data(data):
    with open(proxy, 'a+') as f:
        for item in data:
            f.write(item + '\n')
# Write the collected movie rows into an .xls spreadsheet
def processing_data(content_list):
    # create a workbook with utf-8 encoding
    workbook = xlwt.Workbook(encoding='utf-8')
    worksheet = workbook.add_sheet('My Worksheet')
    # one movie per row, one field per column
    for i, content in enumerate(content_list):
        for x, info in enumerate(content):
            worksheet.write(i, x, label=info)
    workbook.save('movie_info.xls')
# Extract one movie's details from its Douban page and append them to all_list.
# The pinyin variable names come from the original assignment:
# daoyan = director, bianju = screenwriter, zhuyan = cast, leixing = genre,
# shangyingshijian = release date, shichang = runtime, pingfen = rating,
# pingjiarenshu = number of ratings, jianjie = synopsis
def save_info(s, content):
    info = content.xpath("//div[@id='info']")[0]
    try:
        name = str(content.xpath('//*[@id="content"]/h1/span[1]/text()')[0]).replace("'", " ")
    except IndexError:
        name = "N/A"
    try:
        daoyan = str(info.xpath("./span[1]/span[2]/a/text()")[0]).replace("'", " ")
    except IndexError:
        daoyan = "N/A"
    try:
        bianju = str(info.xpath("./span[2]/span[2]/a/text()")[0]).replace("'", " ")
    except IndexError:
        bianju = "N/A"
    try:
        zhuyan = '/'.join(info.xpath("./span[3]/span[2]/a/text()")).replace("'", " ")
    except IndexError:
        zhuyan = "N/A"
    try:
        leixing = '/'.join(info.xpath("./span[@property='v:genre']/text()")).replace("'", " ")
    except IndexError:
        leixing = "N/A"
    try:
        shangyingshijian = '/'.join(info.xpath(".//span[@property='v:initialReleaseDate']/text()")).replace("'", " ")
    except IndexError:
        shangyingshijian = "N/A"
    try:
        shichang = str(info.xpath(".//span[@property='v:runtime']/text()")[0]).replace("'", " ")
    except IndexError:
        shichang = "N/A"
    try:
        pingfen = str(content.xpath('//*[@id="interest_sectl"]/div[1]/div[2]/strong/text()')[0]).replace("'", " ")
    except IndexError:
        pingfen = "N/A"
    try:
        jianjie = str(content.xpath('//*[@id="link-report"]/span[1]/text()')[0]).replace("'", " ")
    except IndexError:
        jianjie = "N/A"
    try:
        pingjiarenshu = content.xpath('//*[@id="interest_sectl"]/div[1]/div[2]/div/div[2]/a/span/text()')[0]
    except IndexError:
        pingjiarenshu = "N/A"
    print("Scraping movie #%d: %s (released %s, starring %s)" % (s, name, shangyingshijian[0:4], zhuyan))
    one_info = [name, daoyan, bianju, zhuyan, pingfen, pingjiarenshu, leixing, shangyingshijian, shichang, jianjie]
    all_list.append(one_info)
def main():
    s = 0
    i = 0
    # load the proxy pool once instead of re-reading the file on every request
    with open("proxy.txt", "r") as f:
        proxy_pool = [line.strip() for line in f if line.strip()]
    try:
        for x in range(0, 9999):
            url = 'https://movie.douban.com/j/search_subjects?type=movie&tag=%E5%8D%8E%E8%AF%AD&sort=time&page_limit=20&page_start=' + str(x * 20)
            headers = {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36",
                'Cookie': 'bid=8u7taHNdsWM; __utmc=30149280; __utmc=223695111; __utmz=223695111.1607998669.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __yadk_uid=9x4B44CN2IsA8mMQ5aAyjQ4SaozNfPF2; __gads=ID=faf2684739e4c7f2-22e5424930c50003:T=1607998670:RT=1607998670:S=ALNI_MYbSVvFUx-vDkas8JkBXbnxevAHWA; ll="118282"; ct=y; _vwo_uuid_v2=DE86177D6BC486F18E203C7287F2B1E77|1fd9d3b9c304cda3f3602953aa741fcc; dbcl2="228452659:QZuIW0RNFQA"; ck=Z6d9; push_noty_num=0; push_doumail_num=0; __utma=30149280.78821852.1607998669.1608094761.1608104129.3; __utmz=30149280.1608104129.3.2.utmcsr=accounts.douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/; __utmt=1; __utmv=30149280.22845; __utmb=30149280.2.10.1608104129; __utma=223695111.1226569761.1607998669.1608094761.1608104244.3; __utmb=223695111.0.10.1608104244; _pk_id.100001.4cf6=1b0982adf0b4c756.1607998669.3.1608104244.1608095066.; _pk_ses.100001.4cf6=*'
            }
            # rotate through the proxy pool, wrapping back to the start
            proxyss = proxy_pool[i]
            i = (i + 1) % len(proxy_pool)
            # map both schemes; Douban is served over https
            proxies = {'http': proxyss, 'https': proxyss}
            print(proxies)
            content = requests.get(url, proxies=proxies, headers=headers)
            if content.status_code != 200:
                print('Request failed with status', content.status_code)
            content_json = json.loads(content.text)["subjects"]
            if not content_json:
                # Douban returned an empty page: no more results, stop crawling
                break
            for one_info in content_json:
                one_id = one_info["id"]
                print(one_id)
                url2 = "https://movie.douban.com/subject/%s/" % one_id
                html = requests.get(url2, proxies=proxies, headers=headers)
                if html.status_code == 200:
                    page = etree.HTML(html.content.decode("utf-8"))
                    s += 1
                    save_info(s, page)
                else:
                    print('Request failed with status', html.status_code)
                time.sleep(1)  # be polite between detail-page requests
    except:
        # on any error (ban, dead proxy, parse failure), save what we have so far
        processing_data(all_list)
if __name__ == '__main__':
    proxy = 'proxy.txt'
    url = 'https://www.kuaidaili.com/free/inha/1'
    html = get_data(url)
    data = parse_data(html)
    save_data(data)
    print('Finished fetching free proxies')
    all_list = []
    main()
    processing_data(all_list)
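To sanity-check the spreadsheet afterwards, the .xls file can be read back with xlrd (a separate package you would need to install; the column order matches the one_info list in save_info):

import xlrd

workbook = xlrd.open_workbook('movie_info.xls')
sheet = workbook.sheet_by_index(0)
for r in range(sheet.nrows):
    # each row: name, director, screenwriter, cast, rating, rating count,
    # genre, release date, runtime, synopsis
    print(sheet.row_values(r))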