2021-11-07

抓取网页上的数据,出现任何bug,自动记录爬取位置。

# -*- coding: utf-8 -*-
import requests
from requests_html import HTMLSession
from bs4 import BeautifulSoup
from lxml import etree
import time
import random
# HTTP headers sent with every page request; only a desktop-Chrome
# User-Agent string is set.
req_header = {
    "User-Agent":
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.54 Safari/537.36',
}
# Shared accumulator: the scraping functions append the phone / WeChat text
# they extract here, skipping values that are already present.
contact_links = list()
# #获取网页上所有超链接并存入文本文件
# session = HTMLSession()
# firstpage_links = []
# url = 'https://XXXXX.com'
# def get_alllinks(url):
#     r = session.get(url)
#     return r.html.absolute_links
# firstpage_links.extend(get_alllinks(url))
# for url in firstpage_links:
#     for url_next in get_alllinks(url):
#         if 'weixin/why' in url_next or 'SrcShow.asp?Src_ID=' in url_next:
#             contact_links.append(url_next)
# with open('网址.txt','a',encoding='utf-8') as f:
#     for i in contact_links:
#         f.write(i+'\n')

# Extract the contact text from a phone-listing page.
def get_idtext(url):
    """Fetch ``url`` and collect its phone number and WeChat ID text.

    The caller in this file uses this function for URLs containing
    'SrcShow.asp?Src_ID='. The page bytes are parsed as GBK and two fixed
    XPath locations are read; the first match of each, when present and not
    already stored, is appended to the module-level ``contact_links`` list.

    Args:
        url: Address of the page to fetch (no trailing newline).

    Returns:
        None. Results are accumulated in ``contact_links``.

    Raises:
        requests.RequestException: on connection failure or timeout.
    """
    # Random 0-10 s pause (both ends inclusive) before every request.
    time.sleep(random.randint(0,10))
    # A timeout is required: without one requests waits indefinitely on a
    # stalled server, and the crawl would hang instead of reporting a failure.
    response = requests.get(url,headers=req_header,timeout=30)
    tree = etree.HTML(response.content,parser=etree.HTMLParser(encoding='gbk'))
    base = '/html/body/div[6]/div[1]/div[2]/div[2]/p[2]'
    xpaths = (
        base + '/text()[2]',  # phone number
        base + '/text()[4]',  # WeChat ID
    )
    for xpath in xpaths:
        found = tree.xpath(xpath)
        # Store only the first text node, and only once.
        if found and found[0] not in contact_links:
            contact_links.append(found[0])

# Extract the contact text from a WeChat-listing page.
def get_wxtext(url):
    """Fetch ``url`` and collect its phone number and WeChat ID text.

    The caller in this file uses this function for URLs containing
    'weixin/why'. The page bytes are parsed as GBK and two fixed XPath
    locations are read; the first match of each, when present and not
    already stored, is appended to the module-level ``contact_links`` list.

    Args:
        url: Address of the page to fetch (no trailing newline).

    Returns:
        None. Results are accumulated in ``contact_links``.

    Raises:
        requests.RequestException: on connection failure or timeout.
    """
    # Random 0-10 s pause (both ends inclusive) before every request.
    time.sleep(random.randint(0,10))
    # A timeout is required: without one requests waits indefinitely on a
    # stalled server, and the crawl would hang instead of reporting a failure.
    response = requests.get(url,headers=req_header,timeout=30)
    tree = etree.HTML(response.content,parser=etree.HTMLParser(encoding='gbk'))
    base = '/html/body/div[6]/div[1]/div[3]/div[2]/p[3]'
    xpaths = (
        base + '/text()[2]',  # phone number
        base + '/text()[4]',  # WeChat ID
    )
    for xpath in xpaths:
        found = tree.xpath(xpath)
        # Store only the first text node, and only once.
        if found and found[0] not in contact_links:
            contact_links.append(found[0])
# Resume checkpoint: 1-based line number of the last task-file line a previous
# run got through. Every line up to and including it is skipped below.
last_number = 0
try:
    with open('last_number.txt','r',encoding='utf-8') as f:
        for a in f:
            # The last non-blank line wins; int() tolerates the trailing newline.
            if a.strip():
                last_number = int(a)
except FileNotFoundError:
    # First run: no checkpoint has been written yet, start from line 1.
    pass

number = 0       # 1-based number of the task-file line currently being handled
save_number = 0  # how many pages were fetched successfully during this run

# Scrape phone numbers and WeChat IDs from the listed pages and save them to a
# text file.
with open('爬取网址今日任务.txt','r',encoding='utf-8') as task_file:
    for number, url in enumerate(task_file, start=1):
        if number <= last_number:
            continue
        print(f'从第{number}行开始加')
        try:
            if 'weixin/why' in url:
                get_wxtext(url.rstrip("\n"))
                save_number+=1
            elif 'SrcShow.asp?Src_ID=' in url:
                get_idtext(url.rstrip("\n"))
                save_number+=1
        except (Exception, KeyboardInterrupt) as exc:
            # KeyboardInterrupt is listed explicitly so Ctrl-C stops the crawl
            # through the same checkpoint path instead of being swallowed.
            print('出现问题中断')
            print(f'{number}: {exc!r}')
            # Record the last line that completed, using the real line number:
            # a success counter drifts whenever a line fails or matches
            # neither URL pattern. The next run retries the failing line.
            with open('last_number.txt','w',encoding='utf-8') as f:
                f.write(str(number - 1))
            # Stop here so the checkpoint stays meaningful: nothing past it
            # has been scraped, so a resumed run neither skips nor repeats.
            break
# NOTE(review): a run that finishes without errors leaves last_number.txt
# untouched, as before - confirm whether it should be advanced or reset.

print(len(contact_links))

# Append whatever was collected, including on an interrupted run.
with open('微信手机.txt','a',encoding='utf-8') as f:
    f.writelines(i+'\n' for i in contact_links)
contact_links = []
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值