# -*- coding: utf-8 -*-
"""Scrape company listing pages from qichacha.com with BeautifulSoup.

For every company link on a listing page, the detail page's raw HTML is
saved as one ``.txt`` file inside a directory named after the site logo's
``alt`` text.  Pagination is followed via the "next" link.  Saving to a
database is not implemented yet.
"""
import codecs
import contextlib
import os
import random
import time
import urllib
from urllib import request

import requests  # kept from the original script; currently unused
from bs4 import BeautifulSoup

# Pool of browser User-Agent strings; one is chosen at random per request
# to make the crawler look less like a bot.
USER_AGENT = [
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:55.0) Gecko/20100101 Firefox/55.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:55.0) Gecko/20100101 Firefox/55.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:55.0) Gecko/20100101 Firefox/55.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:55.0) Gecko/20100101 Firefox/55.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:55.0) Gecko/20100101 Firefox/55.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:55.0) Gecko/20100101 Firefox/55.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:55.0) Gecko/20100101 Firefox/54.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:55.0) Gecko/20100101 Firefox/53.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:55.0) Gecko/20100101 Firefox/52.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:55.0) Gecko/20100101 Firefox/50.0",
    "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.104 Safari/537.36 Core/1.53.2306.400 QQBrowser/9.5.10648.400",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    # NOTE(review): this entry was broken across two lines in the scraped
    # source; rejoined here into one plausible UA string.
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
]

BASE_URL = 'https://www.qichacha.com'


def _random_headers():
    """Return HTTP headers with a User-Agent picked at random from the pool."""
    # random.choice replaces the original USER_AGENT[random.randint(0, len-1)].
    return {'User-Agent': random.choice(USER_AGENT)}


def new_txt(soup):
    """Save every company detail page linked from *soup* as a .txt file.

    Parameters:
        soup: BeautifulSoup of a qichacha listing page.  The logo's
            ``alt`` text (div.navi-brand > img) names the output directory;
            each ``a.ma_h1`` anchor is a company detail link.

    Returns:
        int: number of detail pages saved (the original computed this
        counter in ``g`` but discarded it).

    Note: pure network/file I/O with long random sleeps between requests.
    """
    saved = 0
    alt = soup.find("div", "navi-brand").find_next("img").attrs["alt"]
    print('alt', alt)
    # Portable path join instead of the original hard-coded "\\" separators,
    # which only worked on Windows.
    out_dir = os.path.join(os.getcwd(), alt)
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    for anchor in soup.find_all('a', class_="ma_h1"):
        href = anchor.get("href")
        print('ur', href)
        detail_url = BASE_URL + href
        print(str(href) + '.txt')
        saved += 1
        req = urllib.request.Request(detail_url, headers=_random_headers())
        # Randomized delay between requests to avoid hammering the site.
        pause = random.randint(10, 20)
        print('sleeptime', pause)
        time.sleep(pause)
        # Close the HTTP response deterministically (the original leaked it).
        # detail_soup is a new name: the original rebound ``soup`` here,
        # shadowing the listing-page soup this loop iterates over.
        with contextlib.closing(urllib.request.urlopen(req)) as resp:
            detail_soup = BeautifulSoup(resp, "html.parser")
        print('soup', detail_soup)
        page_html = detail_soup.find_all("html")[0]
        print("wangzhi", page_html)
        # Filesystem-safe name: '/' -> ' ', then strip the leading space/'?'
        # (the original lstrip('?') ran after replace(), so the leading
        # space produced by replace() was never removed).
        fname = str(href).replace('/', ' ').lstrip('? ')
        print(fname)
        with codecs.open(os.path.join(out_dir, fname + '.txt'), "w",
                         encoding="UTF-8") as f:
            f.write(str(page_html))
    return saved


def xiayiye(url):
    """Crawl the listing page at *url* and follow "next" pagination links.

    The original recursed once per page, which raises RecursionError on
    long listings; this version iterates instead.  For each page it calls
    :func:`new_txt` to persist the company detail pages, then sleeps a
    random 10-25 seconds before fetching the next page.
    """
    while url:
        req = urllib.request.Request(url, headers=_random_headers())
        with contextlib.closing(urllib.request.urlopen(req)) as resp:
            # resp is the raw page; bs4 parses it with the stdlib html parser.
            soup = BeautifulSoup(resp, "html.parser", from_encoding="utf8")
        saved = new_txt(soup)  # scrape the company names / detail pages
        print('cc', saved)
        next_url = None
        for nxt in soup.find_all('a', 'next'):
            next_url = BASE_URL + nxt.get("href")
            print('ne', next_url)
        if next_url is None:
            break  # no "next" link: last page reached
        pause = random.randint(10, 25)
        print('sleeptime', pause)
        time.sleep(pause)
        url = next_url


if __name__ == '__main__':
    xiayiye("https://www.qichacha.com/g_CQ.html")
使用 bs4 爬取企查查信息；由于时间问题，目前每个页面的数据各保存为一个 txt 文件，尚未实现保存到数据库的方法。
最新推荐文章于 2023-10-18 11:34:18 发布