以下爬虫爬取的数据为 2345 影视“经典电影”栏目的影片信息。
1. 爬取数据写入 csv 文件
#coding:utf-8
import csv
class getcsv:
    """CSV writing helpers shared by the single- and multi-threaded crawlers."""

    # Single-threaded entry point: writes the header row followed by the data rows.
    def write_csv(self, file_path, title, datas):
        """Append one header row and the data rows to *file_path*.

        :param file_path: path of the CSV file (opened in append mode)
        :param title: sequence of column names, written as a single row
        :param datas: iterable of row sequences
        """
        try:
            # open() is the call that raises FileNotFoundError/IOError, so it
            # must be inside the try block for the handlers to be reachable
            # (in the original it sat outside and the handlers were dead code).
            with open(file_path, 'a', encoding='utf-8', newline='') as f:
                csv_write = csv.writer(f, dialect='excel')
                # Header row.
                csv_write.writerow(title)
                # Data rows.
                csv_write.writerows(datas)
        except FileNotFoundError:
            print('指定的文件无法打开')
        except IOError as e:
            print(e)
            print("文件写入错误")

    # Multi-threaded entry point: no header row, each thread appends its rows.
    def write_csv1(self, file_path, datas):
        """Append the data rows to *file_path* without writing a header row.

        :param file_path: path of the CSV file (opened in append mode)
        :param datas: iterable of row sequences
        """
        try:
            with open(file_path, 'a', encoding='utf-8', newline='') as f:
                csv_write = csv.writer(f, dialect='excel')
                csv_write.writerows(datas)
        except FileNotFoundError:
            print('指定的文件无法打开')
        except IOError as e:
            print(e)
            print("文件写入错误")
if __name__ == '__main__':
    # Manual smoke check: build sample rows and show the type of one row.
    writer = getcsv()
    target = "../data/data.csv"
    header = ['a', 'b', 'c']
    rows = [[1, 2, 3], [4, 5, 6]]
    print(type(rows[0]))
2.多线程爬取数据代码
#coding:utf-8
'''
先单后多
一个线程用获取详情页的url GetDetail 1 100
多个线程用于请求并解析详情页 ParseDetail 5 20
'''
import requests
from lxml import etree
from urllib.parse import urljoin #url拼接
import threading
from queue import Queue #队列
from base.getcsv import getcsv
#python多线程时候IO密集型程序
# Shared flag: set to True by the URL-producer thread once every detail-page
# URL has been enqueued; consumer threads read it to decide when to exit.
flag = False
# Lock guarding writes to `flag` across threads.
lock = threading.Lock()
def get_html(url):
    """Fetch *url* and return the response body text.

    :param url: absolute URL to request
    :return: response text for a 2xx status, otherwise None
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.106 Safari/537.36"}
    # A timeout keeps a stalled connection from hanging a crawler thread forever.
    response = requests.get(url, headers=headers, timeout=10)
    # Numeric range check instead of string-prefix matching on the status code.
    return response.text if 200 <= response.status_code < 300 else None
def get_detail_url_list(url):
    """Collect the detail-page URLs from one listing page.

    :param url: listing-page URL
    :return: (next_page_url or None, list of absolute detail-page URLs),
             or None when the page could not be fetched
    """
    html = get_html(url)
    if not html:
        print("Response状态码不是200")
        return
    # Reuse the HTML already fetched above; the original called get_html(url)
    # a second time here, doubling every network request.
    parser = etree.HTML(html)
    # Absolute URLs of every movie detail page on this listing page.
    detail_urls = [urljoin(url, href) for href in
                   parser.xpath("//ul[contains(@class,'v_picTxt')]/li/div[@class='pic']/a/@href")]
    # The last pager link is "下一页" on every page except the final one.
    # Initialize to None so the return below is safe even when the pager
    # xpath matches nothing (the original left next_page_url unbound).
    next_page_url = None
    next_page_tag = parser.xpath("//div[@class='v_page']/a[last()]")
    if next_page_tag:
        next_page_tag = next_page_tag[0]
        next_page_url = urljoin(url, next_page_tag.attrib.get("href")) if "下一页" in next_page_tag.text else None
    return next_page_url, detail_urls
def parse_detail(url):
    """Parse one movie detail page into CSV rows.

    Note: xpath positions are 1-based.

    :param url: detail-page URL
    :return: [[title, actor, director, introduction]] (a single 4-field row
             wrapped in a list), or None when the fetch failed
    """
    html = get_html(url)
    if not html:
        print("Response状态码不是200")
        return
    tree = etree.HTML(html)
    title = "".join(tree.xpath("//div[@class='tit']/h1/text()"))
    # score = tree.xpath("//div[@class='tit']/p/em/text()")[0].replace("分", "")
    actor = " ".join(tree.xpath("//ul[contains(@class,'txtList')]/li[contains(@class,'liActor')]/a/text()"))
    director = "".join(tree.xpath("//ul[contains(@class,'txtList')]/li[@class='li_3']/a/text()"))
    introduction = "".join(tree.xpath("//ul[contains(@class,'newIntro')]/li/p/span/text()"))
    # `record` instead of the original `list`, which shadowed the builtin.
    # list_split(record, 4) wraps the four fields into one row of a row-list,
    # the shape expected by csv.writer.writerows.
    record = [title, actor, director, introduction]
    return list_split(record, 4)
# Chunking helper used to shape parsed fields into CSV rows.
def list_split(items, n):
    """Return *items* divided into consecutive sublists of at most *n* elements."""
    chunks = []
    start = 0
    while start < len(items):
        chunks.append(items[start:start + n])
        start += n
    return chunks
# Subclasses threading.Thread; communicates with the ParseDetail consumer
# threads through a shared queue.
class GetDetailUrlsThread(threading.Thread):
    """Producer thread: walks the listing pages and enqueues every detail-page URL.

    Sets the module-level `flag` (under `lock`) once all URLs are queued so
    the consumer threads know production is finished.
    """

    def __init__(self, queue, url, *args, **kwargs):
        """
        :param queue: queue.Queue shared with the consumer threads
        :param url: first listing-page URL to crawl
        """
        # Forward any extra Thread arguments to the base class.
        super().__init__(*args, **kwargs)
        self.queue = queue
        self.url = url

    def run(self):
        next_url = self.url
        # Follow the pager until the final listing page (next_url becomes None).
        while next_url:
            # All detail-page URLs on the page go into the shared queue.
            next_url, urls = get_detail_url_list(next_url)
            for url in urls:
                self.queue.put(url)
        global flag
        # `with lock:` is equivalent to acquire()/release() but also releases
        # the lock if an exception is raised inside the block.
        with lock:
            # Signal consumers that no more URLs will be produced.
            flag = True
class ParseDetailThread(threading.Thread):
    """Consumer thread: takes detail-page URLs off the queue, parses each page
    and appends the parsed row(s) to the shared CSV file."""

    def __init__(self, queue, *args, **kwargs):
        """
        :param queue: queue.Queue fed by GetDetailUrlsThread
        """
        super().__init__(*args, **kwargs)
        self.queue = queue

    def run(self):
        # Local import keeps the module-level import block unchanged.
        from queue import Empty
        # One consumer handles many URLs until production ends and the queue drains.
        while True:
            # Exit only when the producer has finished AND the queue is empty.
            if self.queue.empty() and flag == True:
                return
            try:
                # Timed get avoids the race in the original code: another
                # consumer could drain the queue between the empty() check and
                # a plain blocking get(), leaving this thread stuck forever.
                url = self.queue.get(timeout=1)
            except Empty:
                continue
            # Parse the detail page into CSV rows.
            self.data = parse_detail(url)
            # copy() so the writer never sees a row list mutated elsewhere.
            msglist = self.data.copy()
            path = '../data/thread_data.csv'
            getcsv().write_csv1(path, msglist)
def spider():
    """Crawl the site with one producer thread and five consumer threads.

    queue.Queue handles information exchange between threads safely.
    maxsize caps the queue length: once full, put() blocks until a consumer
    takes an item; maxsize <= 0 means the queue is unbounded.
    """
    # Shared work queue between the producer and the consumers.
    q = Queue(maxsize=100)
    next_url = "https://dianying.2345.com/list/jingdian-------.html"
    # One producer discovers the detail-page URLs ...
    threads = [GetDetailUrlsThread(queue=q, url=next_url)]
    # ... and five consumers fetch and parse them (I/O bound, so threads
    # overlap the network waits despite the GIL).
    for _ in range(5):
        threads.append(ParseDetailThread(q))
    for t in threads:
        t.start()
    # Block until every thread has finished before returning to the caller.
    # (The original's `global i` leaked the loop variable into module globals
    # and `data=[]` was never used; both are removed.)
    for t in threads:
        t.join()
if __name__ == '__main__':
    import time
    start_time = time.time()
    # Run the crawl to completion; all rows land in thread_data.csv.
    spider()
    # Re-write the CSV with named columns now that every thread is done.
    import pandas as pd
    columns = ['电影名', '演员', '导演', '简介']
    frame = pd.read_csv('../data/thread_data.csv', header=None, names=columns)
    frame.to_csv('../data/thread_data.csv', index=False)
    # print("sum_time={}".format(time.time()-start_time))