Contents
1. Preparation
2. Code Implementation
3. Summary
1. Preparation
The goal is to scrape the news titles from a blog (cnblogs news). Opening the URL shows the following page.
Press F12 to open the page source in the browser's developer tools.
You can see that the news title information is kept inside <a></a> tags, so it can be matched with regular expressions and BeautifulSoup.
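To make the structure concrete, here is a minimal sketch of what such a list item might look like and how BeautifulSoup locates it. The exact markup is an assumption based on the selectors used later in this post (an <a> inside an <h2 class="news_entry">), so the real page may differ slightly.

from bs4 import BeautifulSoup

# Simplified fragment of the list page (assumed markup, for illustration only)
sample = '<h2 class="news_entry"><a href="/n/123456/" target="_blank">Example news title</a></h2>'

soup = BeautifulSoup(sample, "html.parser")
for h2 in soup.find_all('h2', class_="news_entry"):
    a = h2.find('a')
    print(a['href'], a.get_text(strip=True))  # -> /n/123456/ Example news title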
2. Code Implementation
This crawler mainly relies on the following libraries:
import re
import time
import requests
from bs4 import BeautifulSoup
import myresponse
The approach is much the same for most crawlers.
Here we first wrap a few routine steps into a module of our own (myresponse). The User-Agent and cookie in the request headers are the crawler's disguise: the cookie is mainly used for pages that require login, and by copying the cookie from a browser session where we are already logged in, the crawler can access those pages as well.
# myresponse.py -- the wrapper module used by the crawler
import requests


def getResponse(baseurl):
    head = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36 Edg/92.0.902.67",
        "cookie": "Hm_lvt_af43f8d0f4624bbf72abe037042ebff4=1640837022; __gads=ID=a34c31647ad9e765-22ab388e9bd6009c:T=1637739267:S=ALNI_MYCjel4B8u2HShqgmXs8VNhk1NFuw; __utmc=66375729; __utmz=66375729.1663684462.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __gpi=UID=000004c822cf58b2:T=1649774466:RT=1663684463:S=ALNI_Ma3kL14WadtyLP-_lSQquhy_w85ag; __utma=66375729.1148601284.1603116839.1663684462.1663687392.2; .Cnblogs.AspNetCore.Cookies=CfDJ8NfDHj8mnYFAmPyhfXwJojexiKc4NcOPoFywr0vQbiMK4dqoay5vz8olTO_g9ZwQB7LGND5BBPtP2AT24aKeO4CP01olhQxu4EsHxzPVjGiKFlwdzRRDSWcwUr12xGxR89b_HFIQnmL9u9FgqjF6CI8canpEYxvgxZlNjSlBxDcWOzuMTVqozYVTanS-vAUSOZvdUz8T2XVahf8CQIZp6i3JzSkaaGUrXzEAEYMnyPOm5UnDjXcxAW00qwVmfLNW9XO_ITD7GVLrOg-gt7NFWHE29L9ejbNjMLECBdvHspokli6M78tCC5gmdvetlWl-ifnG5PpL7vNNFGYVofGfAZvn27iOXHTdHlEizWiD83icbe9URBCBk4pMi4OSRhDl4Sf9XASm7XKY7PnrAZTMz8pvm0ngsMVaqPfCyPZ5Djz1QvKgQX3OVFpIvUGpiH3orBfr9f6YmA7PB-T62tb45AZ3DB8ADTM4QcahO6lnjjSEyBVSUwtR21Vxl0RsguWdHJJfNq5C5YMp4QS0BfjvpL-OvdszY7Vy6o2B5VCo3Jic; .CNBlogsCookie=71474A3A63B98D6DA483CA38404D82454FB23891EE5F8CC0F5490642339788071575E9E95E785BF883C1E6A639CD61AC99F33702EF6E82F51D55D16AD9EBD615D26B40C1224701F927D6CD4F67B7375C7CC713BD; _ga_3Q0DVSGN10=GS1.1.1663687371.1.1.1663687557.1.0.0; Hm_lvt_866c9be12d4a814454792b1fd0fed295=1662692547,1663250719,1663417166,1663687558; Hm_lpvt_866c9be12d4a814454792b1fd0fed295=1663687558; _ga=GA1.2.1148601284.1603116839; _gid=GA1.2.444836177.1663687558; __utmt=1; __utmb=66375729.11.10.1663687392"}
    response = requests.get(baseurl, headers=head)  # fetch the page with the disguised headers
    response.encoding = 'utf-8'
    html = response.text
    return html
Step 1: fetch the page. Here myresponse is the wrapper module described above. Note that this is where you fill in the cnblogs URL you want to crawl.
html = myresponse.getResponse('the URL you want to crawl')

urls = []        # full links to the news articles
file_title = []  # news titles

soup = BeautifulSoup(html, "html.parser")  # parse the html with BeautifulSoup
Step 2: parse the page. BeautifulSoup first walks the whole tree and locates the titles, which all sit under <h2 class="news_entry">; we pull those elements out first, and then run regular expressions over them.
href = re.compile(r'a.*href="(.*?)"')                  # capture the href value (relative link)
title = re.compile(r'<a.* target="_blank">(.*?)</a>')  # capture the title text inside the <a> tag

for item in soup.find_all('h2', class_="news_entry"):
    h = re.findall(href, str(item))
    t = re.findall(title, str(item))
    for i in h:
        # prepend the site prefix to turn the relative link into a full URL
        urls.append('https://news.cnblogs.com/' + str(i))
    for i in t:
        file_title.append(i)
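As mentioned in the preparation section, the same titles and links can also be pulled out with BeautifulSoup alone. Below is a minimal regex-free sketch, assuming the same h2.news_entry structure; it is an alternative to the loop above rather than part of the original script.

# Alternative extraction without regular expressions (sketch)
for item in soup.find_all('h2', class_="news_entry"):
    a = item.find('a')
    if a is not None and a.get('href'):
        urls.append('https://news.cnblogs.com/' + a['href'])  # full article link
        file_title.append(a.get_text(strip=True))             # title text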
At this point, the news titles and links have all been scraped.
Running it, you can see the collected titles and links.
So how do we get the news content itself? Through the links we just collected: each one is simply a new URL, so we can repeat the process above on it, a crawl nested inside a crawl.
Here is the code:
con = re.compile(r'<p>(.*?)</p>')  # capture the text of each paragraph
news = []
p = 0

for i in urls:
    content = myresponse.getResponse(i)     # fetch each article page
    soup2 = BeautifulSoup(content, "lxml")
    for it in soup2.find_all('div', id="news_body"):
        c = re.findall(con, str(it))
        for j in c:
            print(j)
Finally, the complete code:
import myresponse
import re
import time
import requests
from bs4 import BeautifulSoup

html = myresponse.getResponse('the blog URL you want to crawl')

href = re.compile(r'a.*href="(.*?)"')                  # capture the href value (relative link)
title = re.compile(r'<a.* target="_blank">(.*?)</a>')  # capture the title text inside the <a> tag

urls = []        # full links to the news articles
file_title = []  # news titles

soup = BeautifulSoup(html, "html.parser")  # parse the html with BeautifulSoup

for item in soup.find_all('h2', class_="news_entry"):
    h = re.findall(href, str(item))
    t = re.findall(title, str(item))
    for i in h:
        # prepend the site prefix to turn the relative link into a full URL
        urls.append('https://news.cnblogs.com/' + str(i))
    for i in t:
        file_title.append(i)

print(urls)
print(file_title)

con = re.compile(r'<p>(.*?)</p>')  # capture the text of each paragraph
news = []
p = 0

for i in urls:
    content = myresponse.getResponse(i)     # fetch each article page
    soup2 = BeautifulSoup(content, "lxml")
    for it in soup2.find_all('div', id="news_body"):
        c = re.findall(con, str(it))
        for j in c:
            print(j)
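As a possible extension (not part of the original script), the so-far-unused news and file_title lists could be used to keep each article's text instead of only printing it. Below is a minimal sketch, assuming the loops above have already filled urls and file_title; the output file name news.txt is hypothetical.

# Hypothetical extension: collect each article's paragraphs and write them to a file
for url, t in zip(urls, file_title):
    content = myresponse.getResponse(url)
    soup2 = BeautifulSoup(content, "lxml")
    body = soup2.find('div', id="news_body")
    if body is None:
        continue
    paragraphs = re.findall(con, str(body))
    news.append((t, '\n'.join(paragraphs)))

with open('news.txt', 'w', encoding='utf-8') as f:
    for t, text in news:
        f.write(t + '\n' + text + '\n\n')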
3. Summary
This scraping process is much the same as scraping Bing images in the previous article; the only difference is handling pages that require login, which just needs an extra cookie in the request headers. If anything is unclear, feel free to leave a comment or send me a private message.