Python web-scraping code

This one scrapes the Baidu hot-search list.

# -*- coding: utf-8 -*-
# @Author: AI悦创
# @Date:   2021-09-13 20:16:07
# @Last Modified by:   aiyc
# @Last Modified time: 2021-09-14 08:53:23
import re
import urllib3
from bs4 import BeautifulSoup

# The first function downloads a web page and returns its content.
# The parameter url is the address of the page to download.
# The overall structure is the same as before.
def download_content(url):
	http = urllib3.PoolManager()
	response = http.request("GET", url)
	response_data = response.data
	html_content = response_data.decode()
	return html_content
# The second function saves string content to a file.
# The first parameter is the filename to write to; the second is the string to save.

def save_to_file(filename, content):
	fo = open(filename, "w", encoding="utf-8")
	fo.write(content)
	fo.close()


# Takes the name of the HTML file to parse; returns the corresponding BeautifulSoup object.
def create_doc_from_filename(filename):
	fo = open(filename, "r", encoding='utf-8')
	html_content = fo.read()
	fo.close()
	doc = BeautifulSoup(html_content, "lxml")
	return doc

def parse_html(doc):
	spans = doc.find_all("span")
	# print(spans)

	for i in spans:
		# print(i)
		# print(i.attrs.get("class"))  # dict["class"] raises KeyError if the key is missing; dict.get("class") returns None instead
		if i.attrs.get("class") == ['title-content-title']:
			# print(i)
			# print(type(i))
			# print(i.has_attr('class'))  # check whether the tag has a class attribute
			print(re.findall(r"<span class=\"title-content-title\">(.+?)</span>", str(i)))

def main():
	filename = "C:/Users/yangyunpeng/Desktop/tips3.html"
	url = "https://www.baidu.com"  # urllib3 needs the scheme, not just the bare domain
	result = download_content(url)
	save_to_file(filename, result)
	doc = create_doc_from_filename(filename)
	print(doc)
	parse_html(doc)

if __name__ == '__main__':
	main()
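
The regex on str(i) above works, but BeautifulSoup can filter on the class attribute directly and return the tag text, which avoids re-parsing the HTML with a regular expression. A minimal sketch of that variant (it assumes the Baidu front page still marks hot-search titles with the title-content-title class):

import urllib3
from bs4 import BeautifulSoup

def fetch_hot_search_titles(url="https://www.baidu.com"):
	# Download the page and parse it in one step.
	http = urllib3.PoolManager()
	html = http.request("GET", url).data.decode()
	doc = BeautifulSoup(html, "lxml")
	# find_all can match on class directly; get_text() drops the <span> markup.
	spans = doc.find_all("span", class_="title-content-title")
	return [span.get_text(strip=True) for span in spans]

if __name__ == "__main__":
	for title in fetch_hot_search_titles():
		print(title)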

This one downloads the images on a page.

# -*- coding: utf-8 -*-
# @Author: AI悦创
# @Date:   2021-09-13 20:16:07
# @Last Modified by:   aiyc
# @Last Modified time: 2021-09-14 08:53:23
import urllib3
from bs4 import BeautifulSoup
from urllib.request import urlretrieve

# The first function downloads a web page and returns its content.
# The parameter url is the address of the page to download.
# The overall structure is the same as before.
def download_content(url):
	http = urllib3.PoolManager()
	response = http.request("GET", url)
	response_data = response.data
	html_content = response_data.decode()
	return html_content
# The second function saves string content to a file.
# The first parameter is the filename to write to; the second is the string to save.

def save_to_file(filename, content):
	fo = open(filename, "w", encoding="utf-8")
	fo.write(content)
	fo.close()


# Takes the name of the HTML file to parse; returns the corresponding BeautifulSoup object.
def create_doc_from_filename(filename):
	fo = open(filename, "r", encoding='utf-8')
	html_content = fo.read()
	fo.close()
	doc = BeautifulSoup(html_content, "lxml")
	return doc

def parse_html(doc):
	images = doc.find_all("img")
	for i in images:
		src = i["src"]
		filename = src.split("/")[-1]
		if src.startswith("http"):  # covers both http:// and https:// URLs
			print(src)
			urlretrieve(src, "C:/Users/yangyunpeng/Desktop/" + filename)
		else:  # protocol-relative src such as //host/image.png
			print("https:" + src)
			urlretrieve("https:" + src, "C:/Users/yangyunpeng/Desktop/" + filename)

def main():
	filename = "C:/Users/yangyunpeng/Desktop/tips3.html"
	url = "www.baidu.com"
	result = download_content(url)
	save_to_file(filename, result)
	doc = create_doc_from_filename(filename)
	parse_html(doc)

if __name__ == '__main__':
	main()
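
The else branch above assumes every non-http src is a protocol-relative URL (//host/...). A slightly more defensive sketch (not part of the original) resolves each src against the page URL with urllib.parse.urljoin and skips inline data: images; the destination directory is just the same desktop path used above:

import os
from urllib.parse import urljoin
from urllib.request import urlretrieve

def download_images(doc, page_url, dest_dir="C:/Users/yangyunpeng/Desktop"):
	# doc is a BeautifulSoup object; page_url is the address it was fetched from.
	for img in doc.find_all("img"):
		src = img.get("src")
		if not src or src.startswith("data:"):
			continue  # skip missing src attributes and inline base64 images
		full_url = urljoin(page_url, src)  # handles //host/x.png, /x.png and relative paths
		filename = os.path.basename(full_url.split("?")[0]) or "image"
		urlretrieve(full_url, os.path.join(dest_dir, filename))
		print(full_url)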

Scrapes Tianyancha, but batch crawling gets blocked.

# -*- coding: utf-8 -*-
# @Author: AI悦创
# @Date:   2021-09-13 20:16:07
# @Last Modified by:   aiyc
# @Last Modified time: 2021-09-14 08:53:23
import re
import urllib3
from bs4 import BeautifulSoup

# The first function downloads a web page and returns its content.
# The parameter url is the address of the page to download.
# The overall structure is the same as before.
def download_content(url):
	http = urllib3.PoolManager()
	response = http.request("GET", url)
	response_data = response.data
	html_content = response_data.decode()
	return html_content
# The second function saves string content to a file.
# The first parameter is the filename to write to; the second is the string to save.

def save_to_file(filename, content):
	fo = open(filename, "w", encoding="utf-8")
	fo.write(content)
	fo.close()


# Takes the name of the HTML file to parse; returns the corresponding BeautifulSoup object.
def create_doc_from_filename(filename):
	fo = open(filename, "r", encoding='utf-8')
	html_content = fo.read()
	fo.close()
	doc = BeautifulSoup(html_content, "lxml")
	return doc


def parse_html(doc, url):
	titles = doc.find_all("title")
	print(titles)
	tol = []
	for i in titles:
		# print(i)
		# re.findall never returns None; it returns a (possibly empty) list,
		# so test the list's truth value instead of comparing it to None.
		matches = re.findall(r"<title>(.+?) - 天眼查", str(i))
		if matches:
			a = matches[0] + " " + url  # company name followed by the page URL
			# print(a)
			tol.append(a)
	return tol



def main():
	x = 27095734  # starting company id; the next five ids are requested in turn
	for _ in range(5):
		x = x + 1
		filename = "C:/Users/yangyunpeng/Desktop/tips3.html"
		url = "https://www.tianyancha.com/company/" + str(x)
		print(url)
		result = download_content(url)
		save_to_file(filename, result)
		doc = create_doc_from_filename(filename)
		# print(doc)
		print(parse_html(doc, url))  # parse_html returns the list of matches, so print it

if __name__ == '__main__':
	main()
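
As noted above, batch crawling Tianyancha gets blocked. Whether anything short of an authorised API helps depends on the site's anti-bot rules, but two common courtesy measures are sending a browser-like User-Agent header and pausing between requests. A hedged sketch (the header string and the delay are arbitrary assumptions, and this does not bypass real anti-scraping protection):

import time
import urllib3

def download_content_politely(url, delay_seconds=5):
	# Same idea as download_content, but with a browser-like User-Agent
	# and a pause after each request to slow the crawl down.
	http = urllib3.PoolManager()
	headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}
	response = http.request("GET", url, headers=headers)
	time.sleep(delay_seconds)
	return response.data.decode()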



