北京理工大学-Python网络爬虫与信息提取学习笔记08-CSDN博客

本文链接：https://blog.csdn.net/I_HAVE_COME/article/details/105539035

以下两个实例仅供学习参考，无法直接用于实际抓取，因为目标网页都需要登录后才能访问。

淘宝商品定向爬虫.py

#获取淘宝搜索页面信息，提取其中的商品名称和价格
import requests
import re

def getHtMLText(url):
    """Download *url* and return the response body as text.

    The response encoding is set from ``apparent_encoding`` before the
    text is decoded.

    Args:
        url: Address of the page to fetch.

    Returns:
        The decoded page text, or an empty string when the request fails
        (connection problem, timeout, or an HTTP error status).
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Best-effort fetch: callers treat "" as "nothing to parse".
        # Only request failures are swallowed so that Ctrl-C and genuine
        # programming errors are no longer hidden.
        return ""

def parsePage(ilt, htlm):
    """Extract price/title pairs from page text and append them to *ilt*.

    Prices are taken from ``"realPrice":"<digits and dots>"`` fragments and
    titles from ``/span&gt:"<text>"`` fragments; the n-th price is paired
    with the n-th title.  NOTE(review): the price pattern assumes the page
    embeds the price as a quoted string - confirm against a live page.

    Args:
        ilt: List that receives one ``[price, title]`` entry per product.
        htlm: Page text to scan; an empty value adds nothing.

    Returns:
        None. Results are appended to *ilt* in place.
    """
    import ast  # function-scope import: only this parser needs it

    if not htlm:
        return

    plt = re.findall(r'\"realPrice\"\:\"[\d\.]*\"', htlm)
    tlt = re.findall(r'/span&gt\:\".*?\"', htlm)
    # zip() stops at the shorter list, so a page with unequal numbers of
    # prices and titles still yields every complete pair.
    for raw_price, raw_title in zip(plt, tlt):
        try:
            # Split on the first colon only so a title that itself contains
            # ':' is kept whole.  literal_eval unquotes the value without
            # executing scraped (untrusted) text the way eval() would.
            price = ast.literal_eval(raw_price.split(':', 1)[1])
            title = ast.literal_eval(raw_title.split(':', 1)[1])
        except (ValueError, SyntaxError):
            # Skip a fragment that is not a valid quoted literal.
            continue
        ilt.append([price, title])

def printGoodsList(ilt):
    """Print the collected products as a tab-separated table.

    A header row is printed first, then one row per entry with a running
    index that starts at 1.

    Args:
        ilt: Sequence of entries where item 0 is the price and item 1 is
            the product title.
    """
    tplt = "{:4}\t{:8}\t{:16}"
    print(tplt.format("序号", "价格", "商品名称"))
    # enumerate supplies the row number; the previous counter was never
    # incremented, so every row was printed with index 0.
    for count, g in enumerate(ilt, start=1):
        print(tplt.format(count, g[0], g[1]))

def main():
    """Search for one keyword, scan the result pages and print the products.

    Two result pages are requested; the ``s`` query parameter advances by
    44 for each page.  A page whose processing raises is skipped.
    """
    keyword = '牛仔裤'
    page_total = 2
    base_url = 'https://ai.taobao.com/search/index.htm?key=' + keyword
    collected = []
    for offset in range(0, 44 * page_total, 44):
        try:
            page_html = getHtMLText(base_url + '&s=' + str(offset))
            parsePage(collected, page_html)
        except BaseException:
            # Same reach as a bare except: move on to the next page.
            continue
    printGoodsList(collected)

# Run the crawler only when executed as a script, so importing this module
# does not trigger network requests.
if __name__ == "__main__":
    main()

股票数据定向爬虫.py

import requests
from bs4 import BeautifulSoup
import traceback
import re

def getHtMLText(url, code='utf-8'):
    """Download *url* and return the response body decoded with *code*.

    Args:
        url: Address of the page to fetch.
        code: Encoding name assigned to the response before its text is
            read; defaults to ``'utf-8'``.

    Returns:
        The decoded page text, or an empty string when the request fails
        (connection problem, timeout, or an HTTP error status).
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = code
        return r.text
    except requests.RequestException:
        # Best-effort fetch: callers treat "" as "skip this page".
        # Only request failures are swallowed so that Ctrl-C and genuine
        # programming errors are no longer hidden.
        return ""

def getStockList(lst, stockURL):
    """Collect stock codes from the links on a listing page.

    Every ``<a>`` element of the page is inspected; when its ``href``
    contains ``sh`` or ``sz`` followed by six digits, the first such code
    in that ``href`` is appended to *lst*.

    Args:
        lst: List that receives the stock codes, in page order.
        stockURL: Address of the listing page.

    Returns:
        None. Results are appended to *lst* in place.
    """
    html = getHtMLText(stockURL)
    soup = BeautifulSoup(html, 'html.parser')
    code_pattern = re.compile(r'[s][hz]\d{6}')  # compiled once for all links
    for anchor in soup.find_all('a'):
        href = anchor.attrs.get('href')
        # Anchors without a usable href are skipped explicitly instead of
        # relying on a catch-all exception handler.
        if not isinstance(href, str):
            continue
        match = code_pattern.search(href)
        if match:
            lst.append(match.group(0))

def getStockInfo(lst, stockURL, fpath):
    """Fetch the detail page of every stock and append its data to a file.

    For each code in *lst* the page ``stockURL + code + '.html'`` is
    downloaded.  The stock name and the ``dt``/``dd`` pairs inside the
    ``div`` with class ``stock-bets`` are gathered into a dict, and
    ``str(dict)`` is appended to *fpath* as one UTF-8 line.  A progress
    percentage is rewritten on the same console line after every stock,
    including stocks that were skipped or failed.

    Args:
        lst: Stock codes to look up.
        stockURL: Base address that each code is appended to.
        fpath: Path of the output file, opened in append mode.

    Returns:
        None.
    """
    total = len(lst)
    for count, stock in enumerate(lst, start=1):
        url = stockURL + stock + '.html'
        try:
            html = getHtMLText(url)
            if html == "":
                continue
            soup = BeautifulSoup(html, 'html.parser')
            stockInfo = soup.find('div', attrs={'class': 'stock-bets'})
            if stockInfo is None:
                # Page has no info block: nothing to record for this stock.
                continue

            names = stockInfo.find_all(attrs={'class': 'bets-name'})
            name_parts = names[0].text.split() if names else []
            if not name_parts:
                continue
            infoDict = {'股票名称': name_parts[0]}

            # Pair labels with values; zip() tolerates unequal list lengths
            # instead of discarding the whole record on an IndexError.
            for dt, dd in zip(stockInfo.find_all('dt'),
                              stockInfo.find_all('dd')):
                infoDict[dt.text] = dd.text

            with open(fpath, 'a', encoding='utf-8') as f:
                f.write(str(infoDict) + '\n')
        except Exception:
            # Keep crawling the remaining stocks, but show what went wrong.
            traceback.print_exc()
        finally:
            # Runs on success, skip and failure alike, so the percentage
            # always reaches 100.  The label reads "progress" because the
            # value is percent complete, not a speed.
            print("\r当前进度：{:.2f}%".format(count * 100 / total), end="")

def main():
    """Gather stock codes from the listing page and save each stock's details."""
    listing_page = 'http://quote.eastmoney.com/stock.html'
    detail_base = 'https://gupiao.baidu.com/stock/'
    result_path = 'D:/Python/Workspace/爬虫基础/stock.txt'

    codes = []
    # First fill the code list, then look up and store every stock.
    getStockList(codes, listing_page)
    getStockInfo(codes, detail_base, result_path)

# Run the crawler only when executed as a script, so importing this module
# does not trigger network requests or file writes.
if __name__ == "__main__":
    main()