2021-11-07

最新推荐文章于 2024-08-16 18:55:06 发布

lichunbao_1

最新推荐文章于 2024-08-16 18:55:06 发布

阅读量50

点赞数

文章标签： python 开发语言 后端 爬虫

本文链接：https://blog.csdn.net/lichunbao_1/article/details/121193179

版权

抓取网页上的数据，出现任何bug，自动记录爬取位置。

# -*- coding: utf-8 -*-
import requests
from requests_html import HTMLSession
from bs4 import BeautifulSoup
from lxml import etree
import time
import random
# Headers passed to every requests.get() call below; the User-Agent is a
# desktop Chrome 94 string.
req_header = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/94.0.4606.54 Safari/537.36"
    ),
}
# Shared accumulator: the scraping functions append the phone / WeChat text
# they extract here, and the driver at the bottom writes it out to a file.
contact_links = list()
# #获取网页上所有超链接并存入文本文件
# session = HTMLSession()
# firstpage_links = []
# url = 'https://XXXXX.com'
# def get_alllinks(url):
#     r = session.get(url)
#     return r.html.absolute_links
# firstpage_links.extend(get_alllinks(url))
# for url in firstpage_links:
#     for url_next in get_alllinks(url):
#         if 'weixin/why' in url_next or 'SrcShow.asp?Src_ID=' in url_next:
#             contact_links.append(url_next)
# with open('网址.txt','a',encoding='utf-8') as f:
#     for i in contact_links:
#         f.write(i+'\n')

# Scrape the phone number and WeChat ID from a 'SrcShow.asp?Src_ID=' page
def get_idtext(url, timeout=30):
    """Fetch a page and collect its phone number and WeChat ID.

    Pauses for a random 0-10 seconds, downloads ``url`` using the module's
    ``req_header``, parses the response body as GBK-encoded HTML, and appends
    the first text node matched by each XPath to the module-level
    ``contact_links`` list. Values already in the list are not added again.

    Args:
        url: Address of the page to scrape.
        timeout: Seconds ``requests`` waits for the server before raising a
            timeout error. Without a timeout a stalled connection would block
            the whole run forever.

    Returns:
        None. Results are accumulated in ``contact_links``.

    Raises:
        requests.exceptions.RequestException: If the download fails or times
            out; the caller is expected to handle it.
    """
    # Random pause before every request.
    time.sleep(random.randint(0, 10))
    html = requests.get(url, headers=req_header, timeout=timeout)
    etree_html = etree.HTML(html.content, parser=etree.HTMLParser(encoding='gbk'))
    # Both values live in the same <p>; the 2nd text node is the phone number
    # and the 4th is the WeChat ID.
    paragraph = '/html/body/div[6]/div[1]/div[2]/div[2]/p[2]'
    result_phone = etree_html.xpath(paragraph + '/text()[2]')
    result_weixin = etree_html.xpath(paragraph + '/text()[4]')
    for result in (result_phone, result_weixin):
        # An empty list means the node was not found on this page.
        if result and result[0] not in contact_links:
            contact_links.append(result[0])

# Scrape the phone number and WeChat ID from a 'weixin/why' page
def get_wxtext(url, timeout=30):
    """Fetch a page and collect its phone number and WeChat ID.

    Pauses for a random 0-10 seconds, downloads ``url`` using the module's
    ``req_header``, parses the response body as GBK-encoded HTML, and appends
    the first text node matched by each XPath to the module-level
    ``contact_links`` list. Values already in the list are not added again.

    Args:
        url: Address of the page to scrape.
        timeout: Seconds ``requests`` waits for the server before raising a
            timeout error. Without a timeout a stalled connection would block
            the whole run forever.

    Returns:
        None. Results are accumulated in ``contact_links``.

    Raises:
        requests.exceptions.RequestException: If the download fails or times
            out; the caller is expected to handle it.
    """
    # Random pause before every request.
    time.sleep(random.randint(0, 10))
    html = requests.get(url, headers=req_header, timeout=timeout)
    etree_html = etree.HTML(html.content, parser=etree.HTMLParser(encoding='gbk'))
    # Both values live in the same <p>; the 2nd text node is the phone number
    # and the 4th is the WeChat ID.
    paragraph = '/html/body/div[6]/div[1]/div[3]/div[2]/p[3]'
    result_phone = etree_html.xpath(paragraph + '/text()[2]')
    result_weixin = etree_html.xpath(paragraph + '/text()[4]')
    for result in (result_phone, result_weixin):
        # An empty list means the node was not found on this page.
        if result and result[0] not in contact_links:
            contact_links.append(result[0])
# Resumable scraping driver.
#   number      - 1-based line currently being read from the task file
#   save_number - pages scraped successfully during this run
#   last_number - checkpoint: how many task lines an earlier, interrupted
#                 run had already handled (0 means "start from the top")
number = 0
save_number = 0
last_number = 0
try:
    with open('last_number.txt', 'r', encoding='utf-8') as f:
        for a in f:
            # Ignore blank lines so a trailing newline cannot break int().
            if a.strip():
                last_number = int(a)
except FileNotFoundError:
    # No checkpoint file yet (first run): keep last_number at 0.
    pass

# Line number of the last task line that was handled without an error.
checkpoint = last_number
# Set only when every line of the task file was processed.
finished = False
try:
    # Scrape phone numbers and WeChat IDs from every URL in the task file.
    with open('爬取网址今日任务.txt', 'r', encoding='utf-8') as task_file:
        for url in task_file:
            number += 1
            if number <= last_number:
                # Already handled by a previous run.
                continue
            print(f'从第{number}行开始加')
            try:
                if 'weixin/why' in url:
                    get_wxtext(url.rstrip("\n"))
                    save_number += 1
                elif 'SrcShow.asp?Src_ID=' in url:
                    get_idtext(url.rstrip("\n"))
                    save_number += 1
            except Exception as exc:
                # Stop at the first failure so the checkpoint points just
                # before the failing line and the next run retries it.
                print('出现问题中断')
                print(f'{number}: {exc!r}')
                break
            checkpoint = number
        else:
            finished = True
finally:
    # Runs on normal completion, on a scraping error and on Ctrl-C alike, so
    # neither the position nor the collected data is lost. After a complete
    # run the checkpoint is reset to 0 so the next task file starts from its
    # first line.
    with open('last_number.txt', 'w', encoding='utf-8') as f:
        f.write(str(0 if finished else checkpoint))

    print(len(contact_links))

    with open('微信手机.txt', 'a', encoding='utf-8') as f:
        f.writelines(i + '\n' for i in contact_links)
contact_links = []

lichunbao_1

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
2021-11-07

抓取网页上的数据，出现任何bug，自动记录爬取位置。# -*- coding: utf-8 -*-import requestsfrom requests_html import HTMLSessionfrom bs4 import BeautifulSoupfrom lxml import etreeimport timeimport randomreq_header = {"User-Agent":'Mozilla/5.0 (Windows NT 10.0; Win64; x64) Ap
复制链接

扫一扫