Common Web Scraping Problems

1. requests.get() returns 200 but response.text is empty
Add browser headers (a User-Agent), so the server does not serve an empty page to the default requests client.

import requests
from lxml import etree

# Fetch the page source with a browser-like User-Agent
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'}
html = requests.get("https://blog.csdn.net/IT_XF/article/details/82184585", headers=headers)
# Force the encoding to avoid garbled Chinese text
html.encoding = 'utf-8'
print(html.text)

# Parse the page and extract the target text nodes with XPath
etree_html = etree.HTML(html.text)
content = etree_html.xpath('//*[@id="floor-user-profile_485"]/div/div[2]/div/div[2]/div/div[2]/div/div/div/article/a/div[1]/h4/text()')
for each in content:
    print(each)
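
If the page's charset is not actually UTF-8, hard-coding html.encoding = 'utf-8' can still leave garbled text. A minimal sketch, reusing the same headers and URL above purely for illustration, that lets requests detect the encoding from the response body instead:

import requests

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'}
resp = requests.get("https://blog.csdn.net/IT_XF/article/details/82184585", headers=headers)
# apparent_encoding is guessed from the response body, so it also handles GBK/GB2312 pages
resp.encoding = resp.apparent_encoding
print(resp.status_code, resp.encoding)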

2. Scraping table data

import pandas as pd
import csv
from multiprocessing import Pool
import time


def getdata(url):
    # By inspection, the table we need is the 4th <table> on the page, hence index [3]
    tb = pd.read_html(url)[3]
    tb.to_csv(r'E:\stock.csv', mode='a', encoding='utf_8_sig', header=False, index=False)
    time.sleep(0.5)


# Fetch pages in parallel with a process pool
def myprocesspool(num=10):
    pool = Pool(num)
    results = pool.map(getdata, urls)
    pool.close()
    pool.join()
    return results


if __name__ == '__main__':
    urls = []
    for i in range(1, 179):  # scrape all 178 pages
        tmp = 'http://s.askci.com/stock/a/?reportTime=2017-12-31&pageNum=%s' % (str(i))
        urls.append(tmp)
    # Write the column header row first
    with open(r'E:\stock.csv', 'w', encoding='utf-8-sig', newline='') as f:
        csv.writer(f).writerow(['序号', '股票代码', '股票简称', '公司名称', '省份', '城市',
              '主营业务收入', '净利润', '员工人数', '上市日期', '招股书', '财报', '行业分类', '产品类型', '主营业务'])
    myprocesspool(10)
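
Two notes on the snippet above. First, the if __name__ == '__main__' guard is required because multiprocessing re-imports the module in each worker process on Windows. Second, pd.read_html returns a list of DataFrames, one per <table> element, so the [3] index comes from manually inspecting the page; a quick sketch (hypothetical inspection code, not part of the original) for finding the right index on a new page:

import pandas as pd

url = 'http://s.askci.com/stock/a/?reportTime=2017-12-31&pageNum=1'
tables = pd.read_html(url)        # one DataFrame per <table> element on the page
print(len(tables))                # how many tables pandas found
for idx, t in enumerate(tables):
    print(idx, t.shape)           # pick the index whose shape/columns match the data you need
    print(t.head(2))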

3. Using Selenium to work around some anti-scraping measures

# encoding: utf-8
'''
  @author 李华鑫
  @create 2020-10-09 11:34
  Mycsdn:https://buwenbuhuo.blog.csdn.net/
  @contact: 459804692@qq.com
  @software: Pycharm
  @file: 豆瓣图书.py
  @Version:1.0
  
'''
from selenium import webdriver
from lxml import etree
import time
import csv

start_url = "https://book.douban.com/subject_search?search_text=python&cat=1001&start=%25s0"

# Launch a Chrome browser controlled by Selenium (Selenium 3 style API)
driver = webdriver.Chrome("./chromedriver/chromedriver.exe")
# Open the start URL
driver.get(start_url)
while True:
    # Pause briefly so the page finishes loading
    time.sleep(2)
    # Grab the rendered page source
    content = driver.page_source
    # Extract the data with XPath
    data_list = etree.HTML(content).xpath('//div[@class="item-root"]')[1:]
    for data in data_list:
        item = {}
        item["name"] = data.xpath("./div/div[1]/a/text()")[0]
        item["score"] = data.xpath("./div/div[2]/span[2]/text()")[0]
        with open("./豆瓣图书.csv", "a", encoding="utf-8", newline="") as file:
            writer = csv.writer(file)
            writer.writerow(item.values())
        print(item)
    # Find the "后页" (next page) link
    next_page = driver.find_element_by_xpath('//a[contains(text(),"后页")]')
    # If it still has an href there is another page; otherwise we are done
    if next_page.get_attribute("href"):
        next_page.click()
    else:
        break
# Close the browser
driver.quit()
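
The snippet above uses the Selenium 3 API (webdriver.Chrome(executable_path) and find_element_by_xpath), which has been removed in Selenium 4. A minimal sketch of the equivalent Selenium 4 calls, assuming the same chromedriver path and XPath:

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

# Selenium 4 style: the driver path goes through a Service object
driver = webdriver.Chrome(service=Service("./chromedriver/chromedriver.exe"))
driver.get(start_url)  # same start_url as defined above
# find_element_by_xpath was removed; use find_element(By.XPATH, ...) instead
next_page = driver.find_element(By.XPATH, '//a[contains(text(),"后页")]')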


