1. Prerequisites
Handling a web page given its link (only one approach is shown)
# Fetch and parse a page with urllib.request and BeautifulSoup
def requestOver(url):
    req = request.Request(url)
    response = request.urlopen(req)
    soup = BeautifulSoup(response, 'lxml')
    return soup
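A quick usage sketch of requestOver() (the URL here is only a placeholder, not one from the original article):
# Usage sketch: fetch a page and inspect it; "https://example.com" is a placeholder URL.
soup = requestOver("https://example.com")
print(soup.title)                 # the page's <title> tag
for a in soup.findAll("a"):       # iterate over every link on the page
    print(a.get("href"))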
A module that downloads a page's title and body into a local .txt file
# Download the title and body of a news page into a .txt file
def download(title, url, y):
    soup = requestOver(url)
    tag = soup.find('div', class_="article")   # the <div> holding a Sina article body
    if tag is None:
        return 0
    # Strip characters that Windows does not allow in file names
    title = title.replace(':', '')
    title = title.replace('"', '')
    title = title.replace('|', '')
    title = title.replace('/', '')
    title = title.replace('\\', '')
    title = title.replace('*', '')
    title = title.replace('<', '')
    title = title.replace('>', '')
    title = title.replace('?', '')
    # print(tag.get_text())
    content = ""
    for p in tag.findAll('p'):
        if p.string is not None:
            content = content + p.string   # collected here, though tag.get_text() is what gets written below
    filename = r'E:\code\python\spider_news\sina_news\society\\' + title + '.txt'
    with open(filename, 'w', encoding='utf8') as file_object:
        file_object.write(' ')
        file_object.write(title)
        file_object.write(tag.get_text())
    print('Crawling news #', y, ':', title)
Knowledge of regular expressions (preferably)
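Since regular expressions come up again below, here is a minimal sketch of my own (a hypothetical clean_title helper, not part of the original code) that collapses the chain of str.replace calls above into a single re.sub:
import re

def clean_title(title):
    # Strip the characters that are illegal in Windows file names,
    # equivalent to the chain of str.replace calls in download() above.
    return re.sub(r'[:"|/\\*<>?]', '', title)

print(clean_title('A/B:C?"D'))    # -> ABCD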
2. Static crawling
Static crawling means locating the content you want in the page's HTML source, identifying its tag and attributes, and then extracting it with the crawler. The idea of the code below, using Sina News as the example: use findAll() on the page to collect every link that matches, then traverse them from the start. When entering a page, if it contains article content, download the content and the title and then look for further qualifying URLs inside that page; otherwise the page does not qualify and we leave it.
from bs4 import BeautifulSoup
from urllib import request
# Fetch and parse a page with urllib.request and BeautifulSoup
def requestOver(url):
    req = request.Request(url)
    response = request.urlopen(req)
    soup = BeautifulSoup(response, 'lxml')
    return soup
# Download the title and body of a news page into a .txt file
def download(title, url, y):
    soup = requestOver(url)
    tag = soup.find('div', class_="article")   # the <div> holding a Sina article body
    if tag is None:
        return 0
    # Strip characters that Windows does not allow in file names
    title = title.replace(':', '')
    title = title.replace('"', '')
    title = title.replace('|', '')
    title = title.replace('/', '')
    title = title.replace('\\', '')
    title = title.replace('*', '')
    title = title.replace('<', '')
    title = title.replace('>', '')
    title = title.replace('?', '')
    content = ""
    for p in tag.findAll('p'):
        if p.string is not None:
            content = content + p.string
    filename = r'E:\code\python\\' + title + '.txt'
    with open(filename, 'w', encoding='utf8') as file_object:
        file_object.write(' ')
        file_object.write(title)
        file_object.write(tag.get_text())
    print('Crawling news #', y, ':', title)
# Main crawling procedure
def crawlAll(url, y):
    soup = requestOver(url)
    for tag in soup.findAll("a", target="_blank"):
        if tag.string is not None:  # title is not empty
            if len(tag.string) > 8:  # title is longer than 8 characters
                if ("" in tag.attrs["href"]) or ("" in tag.attrs["href"]):  # URL filter substrings left blank in the source
                    alllist.append(tag.attrs["href"])
                    if tag.attrs["href"] not in collection:
                        collection.add(tag.attrs["href"])
                        try:
                            print(tag.attrs['href'])
                            download(tag.string, tag.attrs['href'], y)
                            y += 1
                        except Exception:
                            print("Failed to crawl news #" + str(y))
                else:
                    crawlAll(tag.attrs['href'], y)
    return y
if __name__ == '__main__':
    y = 1
    collection = set()  # used for de-duplicating links
    alllist = set()     # used to hold the pages you want to crawl
    alllist = [""]      # seed URL goes here (left blank in the source)
    for n in alllist:
        target_url = n
        y = crawlAll(target_url, y)
3. Dynamic crawling
Crawling a dynamic page is more involved; this section shows one way to crawl dynamically loaded content. You will need Postman. The page used in this experiment is the rolling page of the Sina News society channel, which is loaded dynamically: ;lid=2669&k=&num=50&page=1 Since it cannot be crawled with the static approach, we have to capture the request the page sends.
Open the page, right-click → Inspect, switch to the Network tab, and click the page's "next page" button; a request shows up.
Right-click that request → Copy → Copy as cURL (bash).
Paste it into Postman via Import → Raw text. After importing, click Send to try the request; if content comes back, the capture succeeded.
Click Code → Python - Requests to copy the generated code and paste it into PyCharm; usually changing the page parameter is all it takes to crawl the dynamically loaded pages.
import requests
url = ";lid=2669&k=&num=50&page=2&r=0.呵呵86394&callback=jQuery1112024604807702249287_1604838144359&_=1604838144361"
payload = {}
headers = {
    'authority': '',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36',
    'accept': '*/*',
    'sec-fetch-site': 'same-site',
    'sec-fetch-mode': 'no-cors',
    'sec-fetch-dest': '',
    'referer': '',
    'accept-language': 'zh-CN,zh;q=0.9',
}
response = requests.request("GET", url, headers=headers, data=payload)
print(response.text)
The full code is as follows:
import re
from bs4 import BeautifulSoup
from urllib import request
import requests
# Fetch the qualifying links from the dynamically loaded page
def solve(page):
    url1 = ";lid=2669&k=&num=50&page="
    url2 = "&r=0.7488014654950375&callback=jQuery1112025760955190502766_1604665024595&_=1604665024597"
    url = url1 + str(page) + url2
    payload = {}
    headers = {
        'authority': '',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36',
        'accept': '*/*',
        'sec-fetch-site': 'same-site',
        'sec-fetch-mode': 'no-cors',
        'sec-fetch-dest': '',
        'referer': '',
        'accept-language': 'zh-CN,zh;q=0.9',
    }
    response = requests.request("GET", url, headers=headers, data=payload)
    response.encoding = "utf-8"
    l1 = str(response.text.replace("\\", "").split())
    res = re.findall(r'"url":"([^"]+)"', l1)
    return res
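The interface returns JSONP (note the jQuery callback parameter in the URL), which is why solve() falls back to a regex. As an alternative, here is a minimal sketch, assuming the payload has the shape callback({..., "result": {"data": [{"url": ...}, ...]}}) — those field names are an assumption — that strips the callback and parses the body with the json module:
import json
import re
import requests

def solve_json(page):
    # Same endpoint as solve(); the host part of the URL is left blank in the source.
    url = ";lid=2669&k=&num=50&page=" + str(page)
    headers = {'user-agent': 'Mozilla/5.0'}   # minimal header set, enough for a sketch
    text = requests.get(url, headers=headers).text
    # Strip the JSONP wrapper: keep everything between the first "(" and the last ")".
    body = re.search(r'\((.*)\)', text, re.S).group(1)
    data = json.loads(body)
    # Assumed structure: result.data is a list of news items, each with a "url" field.
    return [item["url"] for item in data["result"]["data"]]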
# Fetch and parse a page with urllib.request and BeautifulSoup
def requestOver(url):
    req = request.Request(url)
    response = request.urlopen(req)
    soup = BeautifulSoup(response, 'lxml')
    return soup
# Download the title and body of a news page into a .txt file
def download(title, url, y):
    soup = requestOver(url)
    tag = soup.find('div', class_="article")   # the <div> holding a Sina article body
    if tag is None:
        return 0
    # Strip characters that Windows does not allow in file names
    title = title.replace(':', '')
    title = title.replace('"', '')
    title = title.replace('|', '')
    title = title.replace('/', '')
    title = title.replace('\\', '')
    title = title.replace('*', '')
    title = title.replace('<', '')
    title = title.replace('>', '')
    title = title.replace('?', '')
    # print(tag.get_text())
    content = ""
    for p in tag.findAll('p'):
        if p.string is not None:
            content = content + p.string
    filename = r'E:\code\python\spider_news\sina_news\society\\' + title + '.txt'
    with open(filename, 'w', encoding='utf8') as file_object:
        file_object.write(' ')
        file_object.write(title)
        file_object.write(tag.get_text())
    print('Crawling news #', y, ':', title)
if __name__ == '__main__':
    y = 1
    # The API starts failing after page 50
    for page in range(50):
        url = solve(page)
        for each in url:
            soup = requestOver(each)
            download(soup.find("h1", class_="main-title").string, each, y)
            y += 1
4. Sina News API
All: ;lid=2509&k=&num=50&page=1
Domestic: ;lid=2510&k=&num=50&page=1
International: ;lid=2511&k=&num=50&page=1
Society: ;lid=2669&k=&num=50&page=1
Sports: ;lid=2512&k=&num=50&page=1
Entertainment: ;lid=2513&k=&num=50&page=1
Military: ;lid=2514&k=&num=50&page=1
Technology: ;lid=2515&k=&num=50&page=1
Finance: ;lid=2516&k=&num=50&page=1
Stock market: ;lid=2517&k=&num=50&page=1
US stocks: ;lid=2518&k=&num=50&page=1
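A sketch of how these endpoints could be driven programmatically. The API host is blank in the list above, so API_BASE below is a placeholder you have to fill in yourself; the channel ids (lid) are taken from the list:
API_BASE = ""   # placeholder: the API host is left blank in the list above

CHANNELS = {"all": 2509, "domestic": 2510, "international": 2511, "society": 2669,
            "sports": 2512, "entertainment": 2513, "military": 2514,
            "technology": 2515, "finance": 2516, "stock": 2517, "us_stocks": 2518}

def channel_url(name, page=1, num=50):
    # Build the rolling-news API URL for a given channel and page number.
    lid = CHANNELS[name]
    return API_BASE + ";lid=" + str(lid) + "&k=&num=" + str(num) + "&page=" + str(page)

print(channel_url("society", page=2))   # -> ;lid=2669&k=&num=50&page=2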
5. Aside
Of course, you can also use Scrapy for static pages and Selenium for dynamic ones; these are well-known crawling frameworks and packages. A minimal Selenium sketch follows.
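For reference, a minimal Selenium sketch (not from the original article; the URL is a placeholder) that renders a dynamic page in a real browser and hands the resulting HTML to BeautifulSoup:
from selenium import webdriver
from bs4 import BeautifulSoup

driver = webdriver.Chrome()        # requires a matching chromedriver on the PATH
driver.get("https://example.com")  # placeholder URL
soup = BeautifulSoup(driver.page_source, "html.parser")  # parse the rendered HTML
print(soup.title)
driver.quit()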
6. Statically crawling China News Service (中国新闻网)
For an experiment I needed to crawl the finance stories from the rolling-news section of China News Service. I found that pages "lxml" failed to parse could still be parsed with "html.parser". To crawl the site's other channels, just change the relevant parameters. Because the workflow is simple, no rigorous exception handling was added.
# -*- coding: utf-8 -*-
# encoding='utf-8'
import re
from urllib.request import urlopen
import requests
from bs4 import BeautifulSoup
from urllib import request
import datetime
# Fetch and parse a page with requests and BeautifulSoup
def requestOver(url):
    response = requests.get(url)
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'html.parser')
    return soup
# Download the title and body of a news page into a .txt file
def download(title, url, y):
    soup = requestOver(url)
    tag = soup.find('div', class_="left_zw")   # the <div> holding a chinanews article body
    if tag is None:
        return 0
    # print(type(tag))
    # print(tag.get_text())
    # Strip characters that Windows does not allow in file names
    title = title.replace(':', '')
    title = title.replace('"', '')
    title = title.replace('|', '')
    title = title.replace('/', '')
    title = title.replace('\\', '')
    title = title.replace('*', '')
    title = title.replace('<', '')
    title = title.replace('>', '')
    title = title.replace('?', '')
    # print(tag.get_text())
    content = ""
    for p in tag.findAll('p'):
        if p.string is not None:
            content = content + p.string
    filename = r'E:\code\python\spider_news\sina_news\eco\\' + title + '.txt'
    with open(filename, 'w', encoding='utf-8', errors='ignore') as file_object:
        file_object.write(' ')
        file_object.write(title)
        file_object.write(tag.get_text())
    print('Crawling news #', y, ':', title)
# Main crawling procedure
def crawlAll(url, y):
    soup = requestOver(url)
    for s in soup.findAll("div", class_="content_list"):
        for tag in s.findAll("li"):
            sp = tag.findAll("a")
            if "财经" in str(sp):   # keep only links from the finance ("财经") channel
                title = list(sp)[1].string
                # site prefix left blank in the source; the link is sliced out of the <a> tag's string form
                urlAll = "" + str(list(sp)[1])[9:str(list(sp)[1]).find("shtml")+5]
                try:
                    download(title, urlAll, y)
                except Exception:
                    print("Failed to crawl news #" + str(y))
                else:
                    y += 1
    return y
if __name__ == '__main__':
    y = 1
    url1 = ""            # site prefix left blank in the source
    date = "2020/1112"
    url2 = "/news.shtml"
    for i in range(3650):
        # Step the date back one day at a time and build the rolling-news URL for that day
        date1 = datetime.datetime.strptime(date, "%Y/%m%d")
        date2 = datetime.timedelta(days=-1)
        date = (date1 + date2).strftime("%Y/%m%d")
        target_url = url1 + date + url2
        print(target_url)
        y = crawlAll(target_url, y)
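As an aside, pulling the link out of crawlAll by slicing the string form of the <a> tag is fragile. A sketch of the same step using the tag's href attribute instead (the site prefix stays blank here, as it is in the source):
# Inside the inner loop of crawlAll, instead of slicing str(list(sp)[1]):
link = sp[1].get("href", "")           # read the href attribute directly
if link.endswith(".shtml"):
    urlAll = "" + link                 # site prefix left blank, as in the source
    download(sp[1].string, urlAll, y)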
Original article: https://blog.csdn.net/weixin_44485744/article/details/109563474