Python爬虫实战案例-爬取币世界标红快讯-CSDN博客

爬取币世界标红快讯内容(移动版)

# 引入依赖
from lxml import etree
import requests
import pymongo
import time
# Connect to MongoDB (you need your own MongoDB installation); replace the
# placeholder host below with your own database address.
client = pymongo.MongoClient(host='写你自己的数据库地址', port=27017)
mydb = client.mydb
information = mydb.information  # collection that receives the scraped items
# strftime() formats the current local time when no time tuple is passed.
currentTime = time.strftime("%m%d%H")  # month/day/hour stamp used in the URL and state file
saveTime = time.strftime("%Y-%m-%d %H:%M:%S")  # timestamp saved with every record

# Request headers that make the crawler look like a mobile (iPhone Safari) browser.
header = {
    'User-Agent': (
        'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) '
        'AppleWebKit/604.1.38 (KHTML, like Gecko) '
        'Version/11.0 Mobile/15A372 Safari/604.1'
    ),
}

def get_url(url):
    """Scrape the highlighted ("focus") flash-news items on *url* and store them.

    The page is fetched with the mobile ``header``. The current hour stamp
    (``currentTime``) and the id of the first highlighted item are written to
    the local ``newest`` state file, overwriting its previous content. Every
    highlighted item whose id is greater than that first id minus 20 is then
    inserted into the ``information`` collection.

    Args:
        url: Address of the flash-news listing page to fetch.

    Returns:
        None. Results are printed and written to the state file / database.

    Raises:
        requests.RequestException: If the page cannot be fetched in time.
        ValueError: If an item id is not a decimal integer.
    """
    # A timeout keeps a stalled connection from blocking the crawler forever.
    html = requests.get(url, headers=header, timeout=10)
    selector = etree.HTML(html.text)
    infos = selector.xpath('//div[@id="kuaixun_list"]/div/article/section[@class="focus"]')

    # The first highlighted item on the page supplies the reference id.
    newestIds = infos[0].xpath('../@id') if infos else []
    if not newestIds:
        # No highlighted item (or no id on it): there is nothing to record.
        print('无新数据产生!')
        return
    saveId = newestIds[0]

    # Replace this path with your own file location.
    with open(r'C:/Users/SCZ/PycharmProjects/CommunityCrawl/newest', 'w') as file:
        file.write(currentTime + ' ' + saveId)

    # Only items with an id inside this window below the reference id are stored.
    # NOTE(review): nothing here checks whether an item is already in the
    # collection, so repeated runs may insert duplicates - confirm intent.
    threshold = int(saveId) - 20

    for info in infos:
        titles = info.xpath('h3[@class="text_title"]/text()')
        contents = info.xpath('p[@class="text_show"]/text()')
        timenodes = info.xpath('../h3[@class="timenode"]/text()')
        ids = info.xpath('../@id')
        # Skip items missing any expected part. The timenode text itself is not
        # stored (records carry saveTime), but items without one are skipped.
        if not (titles and contents and timenodes and ids):
            continue

        infoId = ids[0]
        data = {
            'title': titles[0].strip(),
            'id': infoId,
            'date': saveTime,
            'content': contents[0].strip(),
            'source': 'bishijie',
        }

        print(data)

        if int(infoId) > threshold:
            print('插入了一条新数据!')
            information.insert_one(data)
        else:
            print('无新数据产生!')

if __name__ == '__main__':
    # Read the state file written by the previous crawl; its first six
    # characters hold the month/day/hour stamp of that crawl.
    # Replace this path with your own file location.
    try:
        with open('C:/Users/SCZ/PycharmProjects/CommunityCrawl/newest', 'r') as fs:
            line = fs.read()
    except FileNotFoundError:
        # First run: the state file is only created by get_url(), so treat a
        # missing file as "no previous crawl" instead of crashing.
        line = ''
    fileDate = line[0:6]

    # Both cases crawl the page for the current hour; only the message differs.
    if fileDate != currentTime:
        print('时间不一致,宕机使用当前系统时间进行爬取!')
    else:
        print('时间一致, 正常运行!')

    urls = ['http://m.bishijie.com/kuaixun?fm=' + currentTime]
    for url in urls:
        get_url(url)
        time.sleep(2)  # pause between requests
            
复制代码