Web Scraping
Step 1: Decide which method to use
- Find the API endpoint
- requests
- selenium
Step 2: Analyze the page structure and the data to scrape
1. Ordinary page content: a single request is enough (see the sketch below)
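For an ordinary static page, one GET request plus BeautifulSoup parsing covers it. A minimal sketch (the URL and selector are placeholders, not a real target):

import requests
from bs4 import BeautifulSoup

# Placeholder URL and selector; swap in the real target
res = requests.get('https://example.com')
soup = BeautifulSoup(res.text, 'lxml')
print(soup.select_one('h1').text)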
2. Pages that only finish loading once you scroll to the bottom (like JD.com listings)
# Scroll down in 800px steps so lazy-loaded content has time to render
height = 800
for _ in range(13):
    b.execute_script(f'window.scrollTo(0,{height})')
    height += 800
    time.sleep(1)
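The fixed loop above assumes the page is roughly 13 screens tall. An alternative sketch keeps scrolling until document.body.scrollHeight stops growing, so the page length doesn't need to be known in advance:

last_height = b.execute_script('return document.body.scrollHeight')
while True:
    b.execute_script('window.scrollTo(0, document.body.scrollHeight)')
    time.sleep(1)  # give lazy-loaded content time to render
    new_height = b.execute_script('return document.body.scrollHeight')
    if new_height == last_height:  # height stopped growing: bottom reached
        break
    last_height = new_height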
3. Pages where you scrape a listing first, then must click into each item to get its details
# search_result is the list of clickable result elements found earlier
for a in search_result:
    a.click()  # opens the detail page in a new tab
    time.sleep(2)
    b.switch_to.window(b.window_handles[-1])  # switch to the newest tab
    soup = BeautifulSoup(b.page_source, 'lxml')
    names = soup.select_one('.wx-tit> h1').text
    print(names)
    b.close()  # close the detail tab
    b.switch_to.window(b.window_handles[0])  # back to the results tab
Step 3: Using each of the three methods
1. Finding the API endpoint (single page)
import requests
import csv

# Hit the JSON API directly instead of parsing HTML
res = requests.get('https://lewan.baidu.com/lewanapi?action=aladdin_rank_games&gameSource=standalone')
result = res.json()
list1 = []
for item in result['result']['data']['annual']:
    names = item['gameName']
    score = item['gameScore']
    types = item['gameTypes']
    index1 = item['gameQueryIndex']
    list1.append([names, score, types, index1])
writer1 = csv.writer(open('files/games.csv', 'w', encoding='utf-8', newline=''))
writer1.writerow(['names', 'score', 'types', 'index1'])
writer1.writerows(list1)
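Opening the CSV inline like this never explicitly closes the file. The same write wrapped in a with block is a safer pattern, since the data is guaranteed to be flushed:

with open('files/games.csv', 'w', encoding='utf-8', newline='') as f:
    writer1 = csv.writer(f)
    writer1.writerow(['names', 'score', 'types', 'index1'])
    writer1.writerows(list1)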
2. requests (single page)
import requests
import csv
from bs4 import BeautifulSoup

# Browser-like headers so the request is less likely to be blocked
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36'
}
res = requests.get('target page URL', headers=headers)  # placeholder URL
soup1 = BeautifulSoup(res.text, 'lxml')
divs = soup1.select('.mod_figure.mod_figure_v_default.mod_figure_list_box>div')
list1 = []
for div in divs:
    names = div.select_one('.list_item>a').attrs['title']
    titles = div.select_one('.list_item>div>div').attrs['title']
    pic = div.select_one('.list_item img').attrs['src']
    list1.append([names, titles, pic])
writer1 = csv.writer(open('files/tencent_variety.csv', 'w', encoding='utf-8', newline=''))
writer1.writerow(['show name', 'theme', 'cover'])
writer1.writerows(list1)
3. selenium (single page)
from selenium.webdriver import ChromeOptions, Chrome
from selenium.webdriver.common.keys import Keys
import time
import csv
from bs4 import BeautifulSoup

options = ChromeOptions()
# Hide the "controlled by automated software" banner
options.add_experimental_option('excludeSwitches', ['enable-automation'])
# Skip loading images to speed the page up
options.add_experimental_option('prefs', {'profile.managed_default_content_settings.images': 2})
b = Chrome(options=options)
b.get('target URL')  # placeholder URL

# Type the query into the search box and submit
search_box = b.find_element_by_id('search box id')  # placeholder id
search_box.send_keys('search query')
search_box.send_keys(Keys.ENTER)
time.sleep(2)

soup1 = BeautifulSoup(b.page_source, 'lxml')
divs = soup1.select('.mod_figure.mod_figure_v_default.mod_figure_list_box>div')
list1 = []
for div in divs:
    names = div.select_one('.list_item>a').attrs['title']
    titles = div.select_one('.list_item>div>div').attrs['title']
    pic = div.select_one('.list_item img').attrs['src']
    list1.append([names, titles, pic])
writer1 = csv.writer(open('files/tencent_variety.csv', 'w', encoding='utf-8', newline=''))
writer1.writerow(['show name', 'theme', 'cover'])
writer1.writerows(list1)
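Note that the find_element_by_id / find_element_by_class_name helpers used in these notes were removed in Selenium 4. If they raise AttributeError, the equivalent By-locator calls are:

from selenium.webdriver.common.by import By

search_box = b.find_element(By.ID, 'search box id')    # replaces find_element_by_id
next_btn = b.find_element(By.CLASS_NAME, 'pn-next')    # replaces find_element_by_class_name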
Step 4: Scraping multiple pages
Click "next page" to reach page 2, scroll down again, scrape its content... and so on until the last page. This repeated operation is wrapped in fun2 below; a driver loop that calls it is sketched after the function.
def fun2():
    # Go to the next page
    next_btn = b.find_element_by_class_name('pn-next')
    next_btn.click()
    # Scroll down step by step so lazy-loaded items render
    height = 800
    for _ in range(12):
        b.execute_script(f'window.scrollTo(0,{height})')
        height += 800
        time.sleep(2)
    soup1 = BeautifulSoup(b.page_source, 'lxml')
    lis = soup1.select('.gl-warp.clearfix>li')
    list1 = []
    for li in lis:
        price = li.select_one('div.p-price > strong > i').text
        # Some items have no shop name element; select_one returns None then
        try:
            shop_name = li.select_one('a.curr-shop.hd-shopname').text
        except AttributeError:
            shop_name = ''
        list1.append([price, shop_name])
    # Append each page's rows to the same CSV
    writer1 = csv.writer(open('files/jd_phones.csv', 'a', encoding='utf-8', newline=''))
    writer1.writerows(list1)
    time.sleep(2)
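fun2 handles one page; a driver loop has to call it until the last page. A sketch of one stopping condition, assuming JD marks the exhausted next button by adding 'disabled' to its class (verify against the live page):

while True:
    next_btn = b.find_element_by_class_name('pn-next')
    # Assumed last-page signal: the button's class gains 'disabled'
    if 'disabled' in next_btn.get_attribute('class'):
        break
    fun2()  # clicks next and scrapes the newly loaded page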