1.requests.get()正常返回200但是text无内容
添加浏览器的headers
import requests
from lxml import etree  # fix: the original `import lxml` does not bring `etree` into scope

# Fetch the page source.
# A browser-like User-Agent is required: without it the server answers 200
# but with an empty body (anti-bot measure).
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'}
html = requests.get("https://blog.csdn.net/IT_XF/article/details/82184585", headers=headers)
# Force UTF-8 decoding so Chinese text is not garbled.
html.encoding = 'utf-8'
print(html.text)
# Parse the HTML and pull out the article titles via xpath.
etree_html = etree.HTML(html.text)
content = etree_html.xpath('//*[@id="floor-user-profile_485"]/div/div[2]/div/div[2]/div/div[2]/div/div/div/article/a/div[1]/h4/text()')
for each in content:
    print(each)
2.爬取表格数据
import pandas as pd
import csv
from multiprocessing import Pool
import time
def getdata(url):
    """Fetch one result page and append its stock table to the output CSV.

    Reads all HTML tables from *url*; by inspection the table we need is
    the 4th one on the page (index 3). Rows are appended (mode='a') without
    a header row — the header is written once up front by the caller.
    """
    tb = pd.read_html(url)[3]  # 经观察发现所需表格是网页中第4个表格, hence index [3]
    # header/index take booleans; utf_8_sig keeps Excel happy with Chinese text.
    tb.to_csv(r'E:\stock.csv', mode='a', encoding='utf_8_sig', header=False, index=False)
    time.sleep(0.5)  # throttle so we don't hammer the server
#引入进程池
def myprocesspool(num=10, url_list=None):
    """Scrape pages in parallel with a process pool.

    Parameters
    ----------
    num : int
        Number of worker processes (default 10).
    url_list : list[str] | None
        URLs to hand to ``getdata``. Defaults to the module-level ``urls``
        built under ``__main__`` (kept for backward compatibility).

    Returns the list of per-URL results from ``pool.map``.
    """
    if url_list is None:
        url_list = urls  # fall back to the global list built in __main__
    pool = Pool(num)
    try:
        results = pool.map(getdata, url_list)
    finally:
        # Ensure workers are reaped even if a task raises.
        pool.close()
        pool.join()
    return results
if __name__=='__main__':
    # Build the URL for each of the 178 result pages.
    urls = ['http://s.askci.com/stock/a/?reportTime=2017-12-31&pageNum=%s' % (str(i))
            for i in range(1, 179)]
    # Write the CSV header row once up front; workers then append data rows.
    with open(r'E:\stock.csv', 'w', encoding='utf-8-sig', newline='') as f:
        csv.writer(f).writerow(['序号', '股票代码', '股票简称', '公司名称', '省份', '城市',
                                '主营业务收入', '净利润', '员工人数', '上市日期',
                                '招股书', '财报', '行业分类', '产品类型', '主营业务'])
    # Fan the pages out across 10 worker processes.
    myprocesspool(10)
# encoding: utf-8
'''
@author 李华鑫
@create 2020-10-09 11:34
Mycsdn:https://buwenbuhuo.blog.csdn.net/
@contact: 459804692@qq.com
@software: Pycharm
@file: 豆瓣图书.py
@Version:1.0
'''
from selenium import webdriver
from lxml import etree
import os
import time
import requests
import re
import csv
start_url = "https://book.douban.com/subject_search?search_text=python&cat=1001&start=%25s0"
# Launch a Chrome browser under selenium control and open the search page.
driver = webdriver.Chrome("./chromedriver/chromedriver.exe")
driver.get(start_url)
try:
    while True:
        # Give the JS-rendered page time to finish loading.
        time.sleep(2)
        # Grab the rendered HTML and parse it with lxml.
        content = driver.page_source
        data_list = etree.HTML(content).xpath('//div[@class="item-root"]')[1:]
        for data in data_list:
            item = {}
            item["name"] = data.xpath("./div/div[1]/a/text()")[0]
            item["score"] = data.xpath("./div/div[2]/span[2]/text()")[0]
            # newline='' is required by the csv module; without it Windows
            # gets a blank line between every record.
            with open("./豆瓣图书.csv", "a", encoding="utf-8", newline="") as file:
                writer = csv.writer(file)
                writer.writerow(item.values())
            print(item)
        # Locate the "后页" (next page) link; `next_btn` avoids shadowing
        # the builtin `next`.
        next_btn = driver.find_element_by_xpath('//a[contains(text(),"后页")]')
        if next_btn.get_attribute("href"):
            # A real href means there is another page — click through.
            next_btn.click()
        else:
            # No href on the link: last page reached.
            break
finally:
    # Always release the browser, even if scraping raised mid-loop.
    driver.quit()