Python爬虫爬取标题及内容

清忖灬

已于 2023-01-18 14:44:20 修改

阅读量8.9k

点赞数 11

分类专栏： python爬虫文章标签：爬虫

于 2022-09-21 13:47:45 首次发布

本文链接：https://blog.csdn.net/m0_60964321/article/details/126970909

版权

python爬虫专栏收录该内容

5 篇文章 15 订阅

订阅专栏

一、准备工作

二、代码实现

三、总结

一、准备工作

爬取某博客标题，通过进入网址可得如下页面

F12打开源码

可以看到新闻标题信息保存在<a></a>里面，可以用正则和soup去匹配

二、代码实现

本次爬虫主要用到了如下库

import re
import time
import requests
from bs4 import BeautifulSoup
import myresponse

各个爬虫的方法都是大同小异

这里首先将一些常规方法封装成包。其中的 User-Agent 和 cookie 是爬虫中常用的伪装手段；cookie 主要用于需要登录信息的页面，只要抓取我们已登录会话的 cookie 并随请求一起发送，就可以爬取这些页面啦

def getResponse(baseurl, timeout=10):
    """Download *baseurl* with browser-like headers and return the body as text.

    Args:
        baseurl: URL of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can wait indefinitely on a stalled
            connection, which would hang the whole crawl.

    Returns:
        The response body, decoded as UTF-8.

    Raises:
        requests.exceptions.Timeout: If the server does not answer within
            *timeout* seconds.
    """
    # NOTE(review): the cookie below appears to be a captured logged-in session
    # cookie; consider loading it from configuration instead of hard-coding it.
    head = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36 Edg/92.0.902.67",
        "cookie": "Hm_lvt_af43f8d0f4624bbf72abe037042ebff4=1640837022; __gads=ID=a34c31647ad9e765-22ab388e9bd6009c:T=1637739267:S=ALNI_MYCjel4B8u2HShqgmXs8VNhk1NFuw; __utmc=66375729; __utmz=66375729.1663684462.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __gpi=UID=000004c822cf58b2:T=1649774466:RT=1663684463:S=ALNI_Ma3kL14WadtyLP-_lSQquhy_w85ag; __utma=66375729.1148601284.1603116839.1663684462.1663687392.2; .Cnblogs.AspNetCore.Cookies=CfDJ8NfDHj8mnYFAmPyhfXwJojexiKc4NcOPoFywr0vQbiMK4dqoay5vz8olTO_g9ZwQB7LGND5BBPtP2AT24aKeO4CP01olhQxu4EsHxzPVjGiKFlwdzRRDSWcwUr12xGxR89b_HFIQnmL9u9FgqjF6CI8canpEYxvgxZlNjSlBxDcWOzuMTVqozYVTanS-vAUSOZvdUz8T2XVahf8CQIZp6i3JzSkaaGUrXzEAEYMnyPOm5UnDjXcxAW00qwVmfLNW9XO_ITD7GVLrOg-gt7NFWHE29L9ejbNjMLECBdvHspokli6M78tCC5gmdvetlWl-ifnG5PpL7vNNFGYVofGfAZvn27iOXHTdHlEizWiD83icbe9URBCBk4pMi4OSRhDl4Sf9XASm7XKY7PnrAZTMz8pvm0ngsMVaqPfCyPZ5Djz1QvKgQX3OVFpIvUGpiH3orBfr9f6YmA7PB-T62tb45AZ3DB8ADTM4QcahO6lnjjSEyBVSUwtR21Vxl0RsguWdHJJfNq5C5YMp4QS0BfjvpL-OvdszY7Vy6o2B5VCo3Jic; .CNBlogsCookie=71474A3A63B98D6DA483CA38404D82454FB23891EE5F8CC0F5490642339788071575E9E95E785BF883C1E6A639CD61AC99F33702EF6E82F51D55D16AD9EBD615D26B40C1224701F927D6CD4F67B7375C7CC713BD; _ga_3Q0DVSGN10=GS1.1.1663687371.1.1.1663687557.1.0.0; Hm_lvt_866c9be12d4a814454792b1fd0fed295=1662692547,1663250719,1663417166,1663687558; Hm_lpvt_866c9be12d4a814454792b1fd0fed295=1663687558; _ga=GA1.2.1148601284.1603116839; _gid=GA1.2.444836177.1663687558; __utmt=1; __utmb=66375729.11.10.1663687392"}
    # Fetch the page; the timeout bounds how long a dead server can stall us.
    response = requests.get(baseurl, headers=head, timeout=timeout)
    # Force UTF-8 so .text is decoded the same way regardless of server headers.
    response.encoding = 'utf-8'
    return response.text

下面是第一步：获取网页部分，其中myresponse就是上面提到的封装的包啦。注意这里是要填入博客园url地址

# Download the listing page through the shared request helper.
html = myresponse.getResponse('你要爬取的url地址')
# Collected article links and titles; the parsing loop appends to these.
urls, file_title = [], []
# Parse the downloaded markup with Python's built-in HTML parser.
soup = BeautifulSoup(html, "html.parser")

第二步：解析网页。这里先用 soup 遍历整个 tree，定位到标题都在<h2 class="news_entry">下面，把带有这个标志的节点内容先取出来；然后用正则去匹配取出来的内容

# Captures the quoted value of an href="..." attribute.
href = re.compile(r'a.*href="(.*?)"')
# Captures whatever sits between `target="_blank">` and the closing </a>.
title = re.compile(r'<a.* target="_blank">(.*?)</a>')
for entry in soup.find_all('h2', class_="news_entry"):
    # Serialise the tag once and run both patterns over the same markup.
    markup = str(entry)
    # Links are prefixed with the site prefix before being stored.
    urls.extend('网址前缀http协议' + str(link) for link in href.findall(markup))
    file_title.extend(title.findall(markup))

至此，新闻标题和链接都爬下来啦

下面来看运行结果

那怎么去拿到新闻内容呢？通过链接就可以得到：点进去链接发现就是一个新的url，又可以重复上述过程，俗称“套娃”

来看代码

# Captures the content of each attribute-less, single-line <p>...</p> element.
con = re.compile(r'<p>(.*?)</p>')
# NOTE(review): `news` and `p` are not read anywhere in the visible code.
news = []
p = 0
for article_url in urls:
    # Fetch every collected article link and parse it with the lxml parser.
    page_html = myresponse.getResponse(article_url)
    article_soup = BeautifulSoup(page_html, "lxml")
    for body in article_soup.find_all('div', id="news_body"):
        # Print each paragraph found inside the article body container.
        for paragraph in con.findall(str(body)):
            print(paragraph)

最后看看全部代码

import myresponse
import re
import time
import requests
from bs4 import BeautifulSoup
# Step 1: download the listing page and parse it with the built-in HTML parser.
html = myresponse.getResponse('某博客网址链接')
soup = BeautifulSoup(html, "html.parser")

# Patterns for the link target and the link text inside each headline.
href = re.compile(r'a.*href="(.*?)"')
title = re.compile(r'<a.* target="_blank">(.*?)</a>')
urls, file_title = [], []

# Step 2: every headline sits in an <h2 class="news_entry"> element.
for entry in soup.find_all('h2', class_="news_entry"):
    # Serialise the tag once and run both patterns over the same markup.
    markup = str(entry)
    urls.extend('http协议，前缀' + str(link) for link in href.findall(markup))
    file_title.extend(title.findall(markup))
print(urls)
print(file_title)

# Step 3: follow each collected link and print the article's paragraphs.
con = re.compile(r'<p>(.*?)</p>')
# NOTE(review): `news` and `p` are not read anywhere in the visible code.
news = []
p = 0
for article_url in urls:
    page_html = myresponse.getResponse(article_url)
    article_soup = BeautifulSoup(page_html, "lxml")
    for body in article_soup.find_all('div', id="news_body"):
        for paragraph in con.findall(str(body)):
            print(paragraph)