Scrape the Baidu hot-search entries
import re

import urllib3
from bs4 import BeautifulSoup


def download_content(url):
    # Fetch the page over HTTP and return the decoded HTML text.
    http = urllib3.PoolManager()
    response = http.request("GET", url)
    return response.data.decode()
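# download_content above sets no timeout and no retry policy, so a stalled
# connection can hang forever. A minimal variant as a sketch: urllib3's
# request() accepts timeout= and retries=; the name download_content_safe
# is illustrative, not part of the original script.
def download_content_safe(url):
    http = urllib3.PoolManager()
    response = http.request(
        "GET",
        url,
        timeout=urllib3.Timeout(connect=5.0, read=10.0),
        retries=urllib3.Retry(total=3),
    )
    return response.data.decode()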
def save_to_file(filename, content):
    with open(filename, "w", encoding="utf-8") as fo:
        fo.write(content)
def create_doc_from_filename(filename):
    with open(filename, "r", encoding="utf-8") as fo:
        html_content = fo.read()
    return BeautifulSoup(html_content, "lxml")
def parse_html(doc):
    # Hot-search titles sit in <span class="title-content-title"> tags.
    spans = doc.find_all("span")
    for i in spans:
        if i.attrs.get("class") == ["title-content-title"]:
            print(re.findall(r'<span class="title-content-title">(.+?)</span>', str(i)))
def main():
    filename = "C:/Users/yangyunpeng/Desktop/tips3.html"
    # urllib3 needs the scheme; a bare "www.baidu.com" raises an error.
    url = "https://www.baidu.com"
    result = download_content(url)
    save_to_file(filename, result)
    doc = create_doc_from_filename(filename)
    # print(doc)  # debug: dumps the whole saved document
    parse_html(doc)


if __name__ == '__main__':
    main()
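A side note on parse_html: the regex over str(i) is not needed, because BeautifulSoup can filter on the class and return the tag text directly. A minimal sketch, assuming the titles still sit in <span class="title-content-title"> as above (parse_hot_search is an illustrative name):

def parse_hot_search(doc):
    # class_ filters by CSS class; get_text() extracts the tag's text.
    for span in doc.find_all("span", class_="title-content-title"):
        print(span.get_text(strip=True))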
Scrape the images on a page
import urllib3
from bs4 import BeautifulSoup
from urllib.request import urlretrieve


def download_content(url):
    http = urllib3.PoolManager()
    response = http.request("GET", url)
    return response.data.decode()
def save_to_file(filename, content):
    with open(filename, "w", encoding="utf-8") as fo:
        fo.write(content)
def create_doc_from_filename(filename):
    with open(filename, "r", encoding="utf-8") as fo:
        html_content = fo.read()
    return BeautifulSoup(html_content, "lxml")
def parse_html(doc):
    # Save every <img> on the page to the desktop, named after the URL tail.
    images = doc.find_all("img")
    for i in images:
        src = i["src"]
        filename = src.split("/")[-1]
        # startswith("http") already covers "https", so one test is enough.
        if src.startswith("http"):
            print(src)
            urlretrieve(src, "C:/Users/yangyunpeng/Desktop/" + filename)
        else:
            # Protocol-relative URLs such as //host/img.png need a scheme.
            print("https:" + src)
            urlretrieve("https:" + src, "C:/Users/yangyunpeng/Desktop/" + filename)
def main():
    filename = "C:/Users/yangyunpeng/Desktop/tips3.html"
    url = "https://www.baidu.com"
    result = download_content(url)
    save_to_file(filename, result)
    doc = create_doc_from_filename(filename)
    parse_html(doc)


if __name__ == '__main__':
    main()
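parse_html above assumes every src is either absolute or protocol-relative; a tag with no src attribute, an inline data: URI, or a site-relative path like /img/logo.png would break it. A more defensive loop as a sketch (urljoin and os.path.basename are standard library; download_images and dest_dir are illustrative names):

import os
from urllib.parse import urljoin
from urllib.request import urlretrieve


def download_images(doc, page_url, dest_dir):
    for img in doc.find_all("img"):
        src = img.get("src")  # None when the attribute is missing
        if not src or src.startswith("data:"):
            continue  # skip tags without a usable URL and inline images
        # urljoin resolves //host/..., /path/... and relative forms alike.
        full_url = urljoin(page_url, src)
        filename = os.path.basename(full_url.split("?")[0]) or "image"
        urlretrieve(full_url, os.path.join(dest_dir, filename))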
Scrape Tianyancha; batch scraping gets rate-limited, though
import re

import urllib3
from bs4 import BeautifulSoup


def download_content(url):
    http = urllib3.PoolManager()
    response = http.request("GET", url)
    return response.data.decode()
def save_to_file(filename, content):
    with open(filename, "w", encoding="utf-8") as fo:
        fo.write(content)
def create_doc_from_filename(filename):
    with open(filename, "r", encoding="utf-8") as fo:
        html_content = fo.read()
    return BeautifulSoup(html_content, "lxml")
def parse_html(doc, url):
    # The company name sits in <title>... - 天眼查</title>.
    titles = doc.find_all("title")
    print(titles)
    tol = []
    for i in titles:
        # re.findall returns a list (never None), so test for a non-empty one.
        matches = re.findall(r"<title>(.+?) - 天眼查", str(i))
        if matches:
            tol.append(matches[0] + " " + str(url))
    return tol
def main():
    x = 27095734  # starting company ID; the loop walks five consecutive IDs
    for _ in range(5):
        x = x + 1
        filename = "C:/Users/yangyunpeng/Desktop/tips3.html"
        url = "https://www.tianyancha.com/company/" + str(x)
        print(url)
        result = download_content(url)
        save_to_file(filename, result)
        doc = create_doc_from_filename(filename)
        print(parse_html(doc, url))


if __name__ == '__main__':
    main()
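On the rate-limiting note in the header: spacing requests out and sending a browser-like User-Agent is the usual first mitigation. A sketch under that assumption (urllib3's request() accepts a headers dict and time.sleep is standard library; whether this is enough for tianyancha.com is not guaranteed):

import time

import urllib3


def download_content_politely(http, url):
    # A browser-like User-Agent; the exact value here is illustrative.
    headers = {"User-Agent": "Mozilla/5.0"}
    response = http.request("GET", url, headers=headers)
    time.sleep(2)  # throttle: pause between consecutive requests
    return response.data.decode()


# Usage: create one PoolManager and reuse it across the loop in main().
# http = urllib3.PoolManager()
# html = download_content_politely(http, "https://www.tianyancha.com/company/27095735")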