爬虫复习
第一步:获取网页数据
流程:确定目标网站 -> 确定技术(直接requests发送(加user-agent、加cookie)、找数据接口、使用selenium)
反爬:浏览器身份验证反爬(加user-agent)、登录反爬(requests的headers中添加cookie、selenium加cookie)、封IP反爬
第二步:解析数据
正则
bs4(基于css选择器的解析器)
lxml(基于xpath的解析器)
第三步:保存数据
csv、excel
1.多线程
1)主线程
一个进程默认有且只有一个线程,这个线程叫主线程;
默认情况下所有的代码都是在主线程中执行的。
2)子线程
除了主线程以外的线程都叫子线程,所有的子线程都是由程序员通过Thread和Thread的子类创建出来。
程序中如果需要子线程,必须先创建;需要几个就创建几个。
from datetime import datetime
import time
from threading import current_thread, Thread
def download(name):
    """Simulate downloading movie `name`: print start/finish timestamps around a fixed 1-second pause."""
    stamp = datetime.now  # local alias, evaluated once per message
    # Show which thread is executing this call.
    print(f'位置2:{current_thread()}')
    print(f'{name}开始下载:{stamp()}')
    time.sleep(1)
    print(f'{name}下载完成:{stamp()}')
if __name__ == '__main__':
    # 1. Sequential download of three movies on the main thread (kept for comparison).
    # print(f'位置1:{current_thread()}')
    # download('肖申克的救赎')
    # download('霸王别姬')
    # download('阿甘正传')
    # 2. Concurrent download: one worker thread per movie.
    titles = ('肖申克的救赎', '霸王别姬', '阿甘正传')
    workers = [Thread(target=download, args=(title,)) for title in titles]
    # Launch all workers; they run in parallel with the main thread.
    for worker in workers:
        worker.start()
1) 创建子线程: Thread(target=函数, args=元组)
函数 - 需要在子线程中调用执行的函数
元组 - 元组中的元素就是函数在调用的时候需要的实参
返回值:子线程对象
2) 启动线程
线程对象.start()
# 练习:用多线程同时下载top250 10页数据并且解析
from threading import Thread
import requests
import time
from datetime import datetime
def get_html(start):
    """Fetch one Douban Top250 page at offset `start` and hand the HTML to analyse_data."""
    print(f'开始:{datetime.now()}')
    # Browser-like user-agent to get past the basic anti-bot check.
    request_headers = {
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36'
    }
    page_url = f'https://movie.douban.com/top250?start={start}&filter='
    resp = requests.get(page_url, headers=request_headers)
    analyse_data(resp.text)
def analyse_data(data):
    """Placeholder for the page-parsing step; intentionally does nothing yet."""
if __name__ == '__main__':
    # One worker thread per page: offsets 0, 25, ..., 225 (10 pages total).
    for offset in range(0, 250, 25):
        Thread(target=get_html, args=(offset,)).start()
使用多线程运行selenium(每个线程一个浏览器实例)
from selenium.webdriver import Chrome
from threading import Thread
from selenium.webdriver.common.keys import Keys
def get_net_data(g):
    """Open jd.com in its own Chrome window, search for keyword `g`,
    then block on input() so the browser window stays open for inspection.
    """
    b = Chrome()
    b.get('https://www.jd.com')
    # Implicit wait so element lookups retry for up to 5 seconds.
    b.implicitly_wait(5)
    # BUG FIX: find_element_by_id() was removed in Selenium 4.3;
    # find_element(by, value) works on both Selenium 3 and 4.
    search = b.find_element('id', 'key')
    search.send_keys(g)
    search.send_keys(Keys.ENTER)
    # Keep the thread (and browser) alive until the user presses Enter.
    input('')
if __name__ == '__main__':
    # One browser + one thread per search keyword.
    for keyword in ('零食', '笔记本电脑', '红包'):
        Thread(target=get_net_data, args=(keyword,)).start()
多进程
from multiprocessing import Process, current_process
from datetime import datetime
import time
# Process是系统通过multiprocessing提供的进程类,什么时候需要子进程,什么时候就创建Process的对象
# 每个程序默认的进程是主进程
def download(name):
    """Simulate downloading movie `name`: print start/finish timestamps around a 1-second pause."""
    started = datetime.now()
    print(f'{name}开始下载:{started}')
    time.sleep(1)
    print(f'{name}下载完成:{datetime.now()}')
if __name__ == '__main__':
    # Process(target=function, args=tuple):
    #   target - the function executed inside the child process
    #   args   - the positional arguments passed to it
    #   returns the child-process object; start() launches it.
    for x in range(10):
        Process(target=download, args=(f'电影{x}',)).start()
线程类子类
from threading import Thread, current_thread
from datetime import datetime
import time
from random import randint
class DowloadThread(Thread):
    """Thread subclass that simulates downloading the movie named at construction."""

    def __init__(self, name):
        super().__init__()
        # Stores the movie title via Thread's built-in `name` property.
        self.name = name

    def run(self) -> None:
        # run() is invoked automatically when start() is called.
        title = self.name
        print(f'线程:{current_thread()}')
        print(f'{title}开始下载:{datetime.now()}')
        time.sleep(randint(3, 7))
        print(f'{title}下载结束:{datetime.now()}')
if __name__ == '__main__':
    # One custom thread per movie; start() triggers each run().
    for movie in ('肖生克的救赎', '霸王别姬'):
        DowloadThread(movie).start()
进程类子类
import time
from datetime import datetime
from random import randint
from multiprocessing import Process
class DowloadProcess(Process):
    """Process subclass that simulates downloading the movie named at construction."""

    def __init__(self, name):
        super().__init__()
        # Stores the movie title via Process's built-in `name` property.
        self.name = name

    def run(self) -> None:
        # Executed in the child process after start().
        begin = datetime.now()
        print(f'{self.name}开始下载:{begin}')
        time.sleep(randint(2, 4))
        print(f'{self.name}完成下载:{datetime.now()}')
if __name__ == '__main__':
    # One child process per movie.
    for title in ('救赎', '火影忍者'):
        DowloadProcess(title).start()
等待
from threading import Thread
from datetime import datetime
import time
from random import randint
def download(name):
    """Simulate downloading movie `name`, taking a random 3-7 seconds."""
    print(f'{name}开始下载:{datetime.now()}')
    duration = randint(3, 7)
    time.sleep(duration)
    print(f'{name}下载完成:{datetime.now()}')
if __name__ == '__main__':
    first = Thread(target=download, args=('这个杀手不太冷',))
    second = Thread(target=download, args=('小人物',))
    third = Thread(target=download, args=('冒牌天神',))
    # Requirement 1: movie 1 must finish before movies 2 and 3 begin.
    first.start()
    first.join()
    second.start()
    third.start()
    # Requirement 2: announce completion only after ALL downloads are done.
    # thread.join() makes the statements after it wait for that thread's task;
    # joining every thread therefore waits for all of them.
    for worker in (first, second, third):
        worker.join()
    print('下载完成!')
循环下载中的等待
from threading import Thread
from datetime import datetime
import time
from random import randint
def download(name):
    """Pretend to download movie `name`, taking a random 3-7 seconds."""
    begin = datetime.now()
    print(f'{name}开始下载:{begin}')
    time.sleep(randint(3, 7))
    print(f'{name}下载完成:{datetime.now()}')
if __name__ == '__main__':
    # Start one thread per movie, remembering each so we can join later.
    workers = []
    for number in range(1, 11):
        worker = Thread(target=download, args=(f'电影{number}',))
        worker.start()
        workers.append(worker)
    # Block until every download thread has finished, then report.
    for worker in workers:
        worker.join()
    print('全部下载完成!')
线程间通信
# 同一个进程中的多个线程中的数据可以直接共享
from threading import Thread
# Module-level state shared by every thread in this process.
age = 19
nums = []


def change_age():
    """Increment the shared module-level `age` by one."""
    global age
    age += 1
def func1():
    """Read a line from the user (prompted as an integer) and append it to shared `nums`."""
    value = input('请输入一个整数:')
    nums.append(value)
def func2():
    """Read a line from the user (prompted as a float) and append it to shared `nums`."""
    value = input('请输入一个浮点数')
    nums.append(value)
if __name__ == '__main__':
    # Mutate the shared global from the main thread, then again from a child thread.
    change_age()
    t = Thread(target=change_age)
    t.start()
    # NOTE: printed without joining `t` first, so this races with the child thread.
    print(age)
    # Two threads append user input into the same shared list.
    first = Thread(target=func1)
    second = Thread(target=func2)
    first.start()
    second.start()
    first.join()
    second.join()
    print(nums)
作业豆瓣多线程抓取
from threading import Thread
import requests
from datetime import datetime
from bs4 import BeautifulSoup
from re import sub
import csv
def get_html(start):
    """Request one Douban Top250 page at offset `start` and pass its HTML on for parsing."""
    print(f'开始:{datetime.now()}')
    # Logged-in session cookie plus a browser user-agent to bypass the login/bot checks.
    request_headers = {
        'cookie': 'll="118318"; bid=up0LZM0sOso; __utmc=30149280; __utmc=223695111; _vwo_uuid_v2=DA6F707AFE51E7533F988C3EAE8A0C2C2|069f64d1ab604ba4eb08686d28aa07d7; __gads=ID=c43ff1033b829031-22f2e208becf0002:T=1641784036:RT=1641784036:S=ALNI_Mb6K0g7OJuO7c0Qn9lQa34OxGsOWQ; gr_user_id=7e34dfdf-14a4-4eeb-bde8-e708a4889a1d; viewed="35662675_35690661"; dbcl2="223952829:mlln/t/8uR0"; ck=c1Ol; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1642423550%2C%22https%3A%2F%2Fopen.weixin.qq.com%2F%22%5D; _pk_ses.100001.4cf6=*; __utma=30149280.1526783641.1641778436.1641951352.1642423551.8; __utmb=30149280.0.10.1642423551; __utmz=30149280.1642423551.8.4.utmcsr=open.weixin.qq.com|utmccn=(referral)|utmcmd=referral|utmcct=/; __utma=223695111.97997013.1641778441.1641951352.1642423551.7; __utmb=223695111.0.10.1642423551; __utmz=223695111.1642423551.7.5.utmcsr=open.weixin.qq.com|utmccn=(referral)|utmcmd=referral|utmcct=/; push_doumail_num=0; _pk_id.100001.4cf6=e6f7e3e8f5ada154.1641778442.6.1642423697.1641951352.; push_noty_num=0'
        ,
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36'
    }
    page_url = f'https://movie.douban.com/top250?start={start}&filter='
    resp = requests.get(page_url, headers=request_headers)
    analyse_data(resp.text)
def analyse_data(data):
    """Parse one Top250 results page and append a row per movie to the shared
    `all_data` list: [ranking, title, score, comment count, details].
    """
    soup = BeautifulSoup(data, 'lxml')
    for item in soup.select('.grid_view>li'):
        title = item.select_one('.pic>a>img').attrs['alt']
        # Collapse all whitespace so the descriptive paragraph fits one CSV cell.
        message = sub(r'\s+', '', item.select_one('.bd>p').text)
        scores = item.select_one('.bd>.star>.rating_num').text
        # The comment count is the last <span> inside the star block.
        comment = item.select('.bd>.star>span')[-1].text
        ranking = item.select_one('.pic>em').text
        all_data.append([ranking, title, scores, comment, message])
if __name__ == '__main__':
    all_data = []
    ts = []
    # One worker thread per page: offsets 0, 25, ..., 225.
    for start in range(0, 250, 25):
        t = Thread(target=get_html, args=(start,))
        t.start()
        ts.append(t)
    # Wait for every page before writing so all_data is complete.
    for t in ts:
        t.join()
    # Threads finish in arbitrary order; restore ranking order for output.
    all_data.sort(key=lambda item: int(item[0]))
    # BUG FIX: the original opened the file without ever closing it (rows could
    # stay unflushed) and used the platform-default encoding. Use a context
    # manager and explicit UTF-8 so the Chinese text is written correctly.
    with open('files/豆瓣电影.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['排名', '名字', '评分', '评价数', '其他信息'])
        writer.writerows(all_data)