Python web scraping: static and dynamic crawling of news sites


1. Prerequisites

Fetching and parsing a page from its URL (only one approach is shown):

from bs4 import BeautifulSoup
from urllib import request

# Fetch a page with urllib.request and parse it with BeautifulSoup
def requestOver(url):
    req = request.Request(url)
    response = request.urlopen(req)
    soup = BeautifulSoup(response, 'lxml')
    return soup

A function that downloads a page's title and body text into a local txt file:

# Download the title and body of a page into a txt file
def download(title, url, y):
    soup = requestOver(url)
    tag = soup.find('div', class_="article")
    if tag is None:
        return 0
    # strip characters that are not allowed in Windows file names
    for ch in ':"|/\\*<>?':
        title = title.replace(ch, '')
    content = ""
    for p in tag.findAll('p'):
        if p.string is not None:
            content = content + p.string
    filename = r'E:\code\python\spider_news\sina_news\society\\' + title + '.txt'
    with open(filename, 'w', encoding='utf8') as file_object:
        file_object.write('           ')
        file_object.write(title)
        file_object.write(tag.get_text())
    print('Crawling news item', y, ':', title)

Regular expression knowledge (strongly recommended).
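For example, here is a small sketch of the kind of pattern used later in this article to pull article URLs out of an API response. The sample string is made up for illustration; the regex itself is the one used in solve() below.

import re

# Hypothetical sample text in the style of the Sina roll-news API response
sample = '{"url":"https:\\/\\/news.sina.com.cn\\/s\\/example.shtml","title":"demo"}'
# Extract every value of a "url" field after stripping escaping backslashes
urls = re.findall(r'"url":"([^"]+)"', sample.replace('\\', ''))
print(urls)   # ['https://news.sina.com.cn/s/example.shtml']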

2. Static crawling

Static crawling means locating the content you want in the page's source, identifying its tag and attributes, and then extracting it with the crawler. The idea of this code, using Sina News as an example: findAll() collects every qualifying URL on a page and they are traversed from the beginning. After entering a URL, if the page contains an article, its title and body are downloaded and that page is searched again for further qualifying URLs; otherwise the page does not qualify and is abandoned.

from bs4 import BeautifulSoup
from urllib import request

# Fetch a page with urllib.request and parse it with BeautifulSoup
def requestOver(url):
    req = request.Request(url)
    response = request.urlopen(req)
    soup = BeautifulSoup(response, 'lxml')
    return soup

# Download the title and body of a page into a txt file
def download(title, url, y):
    soup = requestOver(url)
    tag = soup.find('div', class_="article")
    if tag is None:
        return 0
    # strip characters that are not allowed in Windows file names
    for ch in ':"|/\\*<>?':
        title = title.replace(ch, '')
    content = ""
    for p in tag.findAll('p'):
        if p.string is not None:
            content = content + p.string
    filename = r'E:\code\python\\' + title + '.txt'
    with open(filename, 'w', encoding='utf8') as file_object:
        file_object.write('           ')
        file_object.write(title)
        file_object.write(tag.get_text())
    print('Crawling news item', y, ':', title)

# Main crawling routine
def crawlAll(url, y):
    soup = requestOver(url)
    for tag in soup.findAll("a", target="_blank"):
        if tag.string != None:  # title is not empty
            if len(tag.string) > 8:  # title is longer than 8 characters
                # NOTE: the URL-filter substrings were lost from the original
                # source; fill in fragments of the target site's URLs here
                if(("" in tag.attrs["href"]) or ("" in tag.attrs["href"])):
                    alllist.append(tag.attrs["href"])
                    if tag.attrs["href"] not in collection:
                        collection.add(tag.attrs["href"])
                        try:
                            print(tag.attrs['href'])
                            download(tag.string, tag.attrs['href'], y)
                            y += 1
                        except Exception:
                            print("Failed to crawl news item " + str(y))
                        else:
                            crawlAll(tag.attrs['href'], y)
    return y

if __name__ == '__main__':
    y = 1
    collection = set()  # used to deduplicate links
    alllist = [""]  # start pages to crawl; the seed URL was lost from the source, fill it in
    for n in alllist:
        target_url = n
        y = crawlAll(target_url, y)
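The recursion above can grow deep on link-heavy pages. As a sketch of the same idea written iteratively, here is a breadth-first variant with a queue and a visited set; crawl_bfs, url_keyword and max_pages are my own names, and requestOver/download are the functions defined above.

from collections import deque

# Iterative (breadth-first) variant of crawlAll
def crawl_bfs(start_url, url_keyword, max_pages=100):
    visited = set()
    queue = deque([("", start_url)])   # (anchor text, url) pairs
    y = 1
    while queue and y <= max_pages:
        title, url = queue.popleft()
        if url in visited:
            continue
        visited.add(url)
        if title:                      # the seed page has no anchor text
            try:
                download(title, url, y)
                y += 1
            except Exception:
                print("Failed to crawl news item " + str(y))
        soup = requestOver(url)
        for a in soup.findAll("a", target="_blank"):
            href = a.attrs.get("href", "")
            if a.string and len(a.string) > 8 and url_keyword in href:
                queue.append((a.string, href))
    return y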

3. Dynamic crawling

Crawling dynamic pages is more involved; this section introduces one way to scrape dynamically loaded content. You will need postman. The page used in this experiment is the Sina News society-channel rolling page, whose list is loaded dynamically from an API request of the form ;lid=2669&k=&num=50&page=1. Since the static approach cannot fetch it, the page's network requests need to be captured.

Open the page, right-click and choose Inspect, switch to the Network tab, and click the page's "next page" button; a request will appear.

Right-click that request, then Copy → Copy as cURL (bash).

Paste it into postman via Import → Raw text. After importing, click Send to try the request; if content comes back, the capture succeeded.

Click Code → Python - Requests to copy the generated code and paste it into PyCharm; usually changing only the page parameter is enough to crawl the dynamically loaded pages.

import requests

url = ";lid=2669&k=&num=50&page=2&r=0.呵呵86394&callback=jQuery1112024604807702249287_1604838144359&_=1604838144361"

payload={}
headers = {
  'authority': '',
  'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36',
  'accept': '*/*',
  'sec-fetch-site': 'same-site',
  'sec-fetch-mode': 'no-cors',
  'sec-fetch-dest': '',
  'referer': '',
  'accept-language': 'zh-CN,zh;q=0.9',
}

response = requests.request("GET", url, headers=headers, data=payload)

print(response.text)

The full code is as follows:

import re
from bs4 import BeautifulSoup
from urllib import request
import requests

# Dynamically fetch the qualifying article links from the API
def solve(page):
    # NOTE: the scheme and host of the API URL were lost from the original
    # source; prepend them to url1 before running
    url1 = ";lid=2669&k=&num=50&page="
    url2 = "&r=0.7488014654950375&callback=jQuery1112025760955190502766_1604665024595&_=1604665024597"
    url = url1 + str(page) + url2
    payload = {}
    headers = {
      'authority': '',
      'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36',
      'accept': '*/*',
      'sec-fetch-site': 'same-site',
      'sec-fetch-mode': 'no-cors',
      'sec-fetch-dest': '',
      'referer': '',
      'accept-language': 'zh-CN,zh;q=0.9',
    }
    response = requests.request("GET", url, headers=headers, data=payload)
    response.encoding = "utf-8"
    l1 = str(response.text.replace("\\", "").split())
    res = re.findall(r'"url":"([^"]+)"', l1)
    return res

# Fetch a page with urllib.request and parse it with BeautifulSoup
def requestOver(url):
    req = request.Request(url)
    response = request.urlopen(req)
    soup = BeautifulSoup(response, 'lxml')
    return soup

# Download the title and body of a page into a txt file
def download(title, url, y):
    soup = requestOver(url)
    tag = soup.find('div', class_="article")
    if tag is None:
        return 0
    # strip characters that are not allowed in Windows file names
    for ch in ':"|/\\*<>?':
        title = title.replace(ch, '')
    content = ""
    for p in tag.findAll('p'):
        if p.string is not None:
            content = content + p.string
    filename = r'E:\code\python\spider_news\sina_news\society\\' + title + '.txt'
    with open(filename, 'w', encoding='utf8') as file_object:
        file_object.write('           ')
        file_object.write(title)
        file_object.write(tag.get_text())
    print('Crawling news item', y, ':', title)

if __name__ == '__main__':
    y = 1
    # the API errors out beyond page 50
    for page in range(50):
        url = solve(page)
        for each in url:
            soup = requestOver(each)
            download(soup.find("h1", class_="main-title").string, each, y)
            y += 1
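A side note on the regex in solve(): the response is wrapped in a jQuery JSONP callback, which is why the code strips backslashes and pattern-matches on "url". Many JSONP endpoints return plain JSON when the callback parameter is omitted; assuming that holds here (I have not verified it for this feed), a sketch of the JSON route could look like this, with base_url standing for the API address and the field names being assumptions about the JSON layout:

import requests

def fetch_urls_json(base_url, page):
    # Assumed behaviour: omitting "callback" yields plain JSON
    params = {"lid": 2669, "k": "", "num": 50, "page": page}
    resp = requests.get(base_url, params=params, timeout=10)
    data = resp.json()
    # "result", "data" and "url" are assumed key names, not verified
    return [item["url"] for item in data.get("result", {}).get("data", [])]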

4. Sina News API

All: ;lid=2509&k=&num=50&page=1
Domestic: ;lid=2510&k=&num=50&page=1
International: ;lid=2511&k=&num=50&page=1
Society: ;lid=2669&k=&num=50&page=1
Sports: ;lid=2512&k=&num=50&page=1
Entertainment: ;lid=2513&k=&num=50&page=1
Military: ;lid=2514&k=&num=50&page=1
Technology: ;lid=2515&k=&num=50&page=1
Finance: ;lid=2516&k=&num=50&page=1
Stocks: ;lid=2517&k=&num=50&page=1
US stocks: ;lid=2518&k=&num=50&page=1
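A small helper for working with this table. It only formats the query fragment for a given channel id and page; the API endpoint itself is not reproduced in the list above, so you have to prepend it yourself. The channel names and lid numbers are taken from the table; the function and dictionary names are mine.

# Channel ids (lid) taken from the table above
SINA_CHANNELS = {
    "all": 2509, "domestic": 2510, "international": 2511, "society": 2669,
    "sports": 2512, "entertainment": 2513, "military": 2514,
    "technology": 2515, "finance": 2516, "stocks": 2517, "us_stocks": 2518,
}

def channel_query(channel, page=1, num=50):
    # Returns only the query fragment shown in the table; prepend the
    # API endpoint before requesting it
    lid = SINA_CHANNELS[channel]
    return "lid={}&k=&num={}&page={}".format(lid, num, page)

# e.g. channel_query("finance", page=2) -> 'lid=2516&k=&num=50&page=2'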

5. Asides

Of course, you can also use scrapy for static pages and selenium for dynamic ones; both are well-known crawling frameworks and packages. A minimal selenium sketch is shown below.
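This sketch is not part of the original article; it assumes Selenium 4 with a Chrome driver installed, and the rolling-news URL is only a placeholder.

from selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()
driver.implicitly_wait(10)
# Placeholder URL: substitute the page whose dynamic content you want
driver.get("https://news.sina.com.cn/roll/")
# Dynamically rendered links are present in the DOM once the page has loaded
for a in driver.find_elements(By.CSS_SELECTOR, "a[target='_blank']"):
    href = a.get_attribute("href")
    if href and a.text and len(a.text) > 8:
        print(a.text, href)
driver.quit()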

6. Statically crawling China News Service (中国新闻网)

This was needed for an experiment: crawling the finance news from the China News Service rolling-news pages. I found that pages which "lxml" failed to parse could be parsed with "html.parser". To crawl other channels of the site, just adjust the relevant parameters. Because the flow is simple, no rigorous exception handling was added.
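As an illustration of that lxml/html.parser observation, here is a tiny helper that tries lxml first and falls back to the built-in parser when the target element cannot be found; the function name and the fallback check are mine, not the article's.

import requests
from bs4 import BeautifulSoup

def parse_with_fallback(url):
    # Try lxml first; fall back to the more tolerant built-in html.parser
    response = requests.get(url)
    response.encoding = 'utf-8'
    try:
        soup = BeautifulSoup(response.text, 'lxml')
        if soup.find('div', class_="left_zw") is not None:
            return soup
    except Exception:
        pass
    return BeautifulSoup(response.text, 'html.parser')

The article's full crawler follows: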

# -*- coding: utf-8 -*-
# encoding='utf-8'
import re
from urllib.request import urlopen
import requests
from bs4 import BeautifulSoup
from urllib import request
import datetime

# Fetch a page with requests and parse it with BeautifulSoup
def requestOver(url):
    response = requests.get(url)
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'html.parser')
    return soup

# Download the title and body of a page into a txt file
def download(title, url, y):
    soup = requestOver(url)
    tag = soup.find('div', class_="left_zw")
    if tag is None:
        return 0
    # strip characters that are not allowed in Windows file names
    for ch in ':"|/\\*<>?':
        title = title.replace(ch, '')
    content = ""
    for p in tag.findAll('p'):
        if p.string is not None:
            content = content + p.string
    filename = r'E:\code\python\spider_news\sina_news\eco\\' + title + '.txt'
    with open(filename, 'w', encoding='utf-8', errors='ignore') as file_object:
        file_object.write('           ')
        file_object.write(title)
        file_object.write(tag.get_text())
    print('Crawling news item', y, ':', title)

# Main crawling routine
def crawlAll(url, y):
    soup = requestOver(url)
    for s in soup.findAll("div", class_="content_list"):
        for tag in s.findAll("li"):
            sp = tag.findAll("a")
            if "财经" in str(sp):  # keep only finance ("财经") items
                title = list(sp)[1].string
                # NOTE: the site's scheme and host were lost from the original
                # source; restore the URL prefix before running
                urlAll = "" + str(list(sp)[1])[9:str(list(sp)[1]).find("shtml")+5]
                try:
                    download(title, urlAll, y)
                except Exception:
                    print("Failed to crawl news item " + str(y))
                else:
                    y += 1
    return y

if __name__ == '__main__':
    y = 1
    url1 = ""
    date = "2020/1112"
    url2 = "/news.shtml"
    for i in range(3650):  # walk back one day at a time, up to ten years
        date1 = datetime.datetime.strptime(date, "%Y/%m%d")
        date2 = datetime.timedelta(days=-1)
        date = (date1 + date2).strftime("%Y/%m%d")
        target_url = url1 + date + url2
        print(target_url)
        y = crawlAll(target_url, y)


Original article: https://blog.csdn.net/weixin_44485744/article/details/109563474
