from lxml import html
import requests
from pyquery import PyQuery as pq
from urllib.parse import urljoin
import time
from threading import Thread
# 获取全部页的网址
# Build the URL of every list page and fetch each one.
def all_url(url):
    """Iterate over list pages 1..50, building each page URL and fetching it.

    Note: the *url* parameter is currently unused (kept for interface
    compatibility); the base URL is hard-coded below.
    """
    page = 1
    while page <= 50:
        # BUG FIX: the page-number suffix had been swallowed into a trailing
        # comment, so every iteration fetched the identical URL and the same
        # content was written 50 times. The r-string prevents escape issues.
        urls = r"https://3g.163.com/touch/news?referFrom=" + str(page) + ".html"
        print("正在爬取第%d页。" % page)
        print(urls)
        page = page + 1
        get_one_page(urls)
# 获取每页源代码
# Download one list page's HTML and pass it to the parser.
def get_one_page(urls):
    """Fetch *urls*, decode it with the detected encoding, and parse it.

    Raises requests.HTTPError on a 4xx/5xx response and
    requests.Timeout if the server does not answer within 10 seconds.
    """
    headers = {'user-agent': 'Mozilla/5.0'}
    # timeout= keeps the crawler from hanging forever on a dead connection;
    # raise_for_status() fails fast instead of parsing an error page.
    r = requests.get(urls, headers=headers, timeout=10)
    r.raise_for_status()
    # apparent_encoding sniffs the real charset (Chinese pages are often
    # mislabeled), avoiding mojibake in the extracted text.
    r.encoding = r.apparent_encoding
    ht = r.text
    parse_one_page(ht)
# 解析每页网页源代码,并获取新闻链接与新闻标题
def parse_one_page(ht):
count = 0
tree = html.fromstring(ht)
news = tree.xpath("//div[contains(@class, 'tab-content')]//article/a/@href")
# thre
尝试先爬取新闻链接,然后爬取链接后的正文
最新推荐文章于 2024-08-03 19:27:22 发布
![](https://img-home.csdnimg.cn/images/20240711042549.png)