做了个关于取名字的小爬虫

最新推荐文章于 2023-12-25 11:04:31 发布

chikun6792

最新推荐文章于 2023-12-25 11:04:31 发布

阅读量375

点赞数

文章标签：爬虫 python 操作系统

原文链接：https://my.oschina.net/u/3317385/blog/888501

版权

背景：家里有个亲戚生了对龙凤胎，取名字很头痛。他们要求名字的第三个字要是五行属土的字，而且两个孩子名字的第三个字能组成一个寓意不错的词。

于是我写了个小程序，突然觉得python真的很强大。哈哈哈。

1.先从网上找到五行属土的字，放到txt文件中；文件格式是我自己处理好的，内容都以逗号分隔。

2.把文件中的字逐个放进一个组词网站进行组词，并判断组成词语的另一个字是否也是属土的字，符合条件的词写入一个词典文件。

3.把生成的词典中的词逐个放进百度词典网站查找释义，生成词典释义文件。

4.手动选择释义较好的词放进测评网站测名字分数。

代码很粗糙，但觉得好好玩。

searchzuci.py

#-*- coding: utf-8 -*-
from selenium import webdriver
from bs4 import BeautifulSoup
from selenium.common.exceptions import NoSuchElementException
import time,os
from searchmodal import *
import threading

def catch_web(url, infile, outfile, items):
    """Collect two-character words built from the characters in ``items``.

    Every entry read from ``infile`` is typed into the lookup form of the
    page behind ``url`` and the form is submitted.  The resulting page is
    then parsed and each ``<li>`` inside a ``<ul class="list_2">`` is
    examined: a word is kept when it is exactly two characters long and
    the text that remains after stripping the looked-up character from
    both ends is an entry of ``items``.  All kept words are written to
    ``outfile``, each one followed by a comma.

    Args:
        url: Address handed to ``conectweb`` (presumably returns a Selenium
            driver already showing the lookup page -- confirm in
            ``searchmodal``).
        infile: File handed to ``readfile``; its entries are the characters
            to look up.
        outfile: Path of the file that receives the kept words.
        items: List of allowed partner characters.  It is modified in
            place: each looked-up character is removed from it once its
            lookup is finished, whether or not the lookup succeeded.
    """
    allitem = []
    driver = conectweb(url)
    try:
        for G in readfile(infile):
            newitem = []
            try:
                driver.find_element_by_xpath(".//*[@id='ss_tj_value_1']").clear()
                driver.find_element_by_xpath(".//*[@id='ss_tj_value_1']").send_keys(G)
                driver.find_element_by_xpath(".//*[@id='main_content']/div[2]/form/input[2]").click()
                # Give the result page a moment to load before reading it.
                time.sleep(1)
            except NoSuchElementException:
                # The form could not be submitted, so the browser still shows
                # an unrelated page; do not scrape it for this character.
                print(G+"is faile")
            else:
                soup = BeautifulSoup(getHtml(driver.current_url), 'html.parser')
                for tag in soup.find_all('ul', attrs={'class': 'list_2'}):
                    for it in tag.find_all('li'):
                        word = it.get_text()
                        if len(word) == 2 and word.strip(G) in items:
                            newitem.append(word)

            # Drop the character from the candidate pool.  The membership
            # test avoids a ValueError when it is absent from ``items``.
            if G in items:
                items.remove(G)
            allitem.append(newitem)
    finally:
        # Always shut the browser down, even if a lookup blew up.
        driver.quit()

    with open(outfile, 'w') as file_out:
        for value in allitem:
            for y in value:
                file_out.write(str(y))
                file_out.write(',')

# Lookup page plus the input/output file names used by the workers.
url = 'http://zuci.51240.com/'
in_filename, in_filename1 = "name.txt", "name1.txt"
out_filename, out_filename1 = "allname.txt", "allname1.txt"

# Pool of allowed characters; handed to catch_web as its ``items`` argument.
nameitempop = readfile("800tu.txt")

# Print a timestamp before the workers start ...
starttime = time.strftime('%Y-%m-%d %X', time.localtime())
print(starttime)

t1 = threading.Thread(
    target=catch_web,
    args=(url, in_filename, out_filename, nameitempop),
)
threads = [t1]

# A second worker for the other file pair is currently disabled:
#t2 = threading.Thread(target=catch_web,args=(url,in_filename1,out_filename1,nameitempop))
#threads.append(t2)

# Launch every worker first, then wait for all of them to finish.
for worker in threads:
    worker.start()
for worker in threads:
    worker.join()

# ... and another one once they are all done.
finaltime = time.strftime('%Y-%m-%d %X', time.localtime())
print(finaltime)

searchciyi.py

#-*- coding: utf-8 -*-
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import time
import threading
import multiprocessing

def readfile(filename, encoding=None):
    """Return the comma-separated entries stored in ``filename``.

    Entries from every line of the file are collected in order.
    Surrounding whitespace (including the line break) is stripped from
    each entry, and empty entries -- for example the one that follows a
    trailing comma -- are dropped.

    Args:
        filename: Path of the text file to read.
        encoding: Text encoding passed to ``open``; ``None`` keeps the
            platform default.

    Returns:
        A list of non-empty entry strings; an empty list for an empty file.
    """
    entries = []
    with open(filename, 'r', encoding=encoding) as handle:
        for line in handle:
            for piece in line.split(','):
                piece = piece.strip()
                if piece:
                    entries.append(piece)
    return entries

def contributpjs(executable_path='C:/Users/Administrator/AppData/Local/Programs/Python/Python36/phantomjs.exe'):
    """Start a PhantomJS browser with a spoofed user agent.

    The PhantomJS default capabilities are copied and the page user-agent
    setting is overridden.  The performance-related command-line switches
    are handed to PhantomJS through ``service_args``.

    Args:
        executable_path: Location of the PhantomJS binary; defaults to the
            path originally hard-coded in this script.

    Returns:
        The started ``webdriver.PhantomJS`` instance.
    """
    # Copy the defaults so the shared DesiredCapabilities dict is not mutated.
    drap = dict(DesiredCapabilities.PHANTOMJS)
    # User agent string reported by the headless browser.
    drap["phantomjs.page.settings.userAgent"] = ("Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:25.0) Gecko/20100101 Firefox/25.0 ")
    # Switches meant to speed PhantomJS up.
    server_args = [
        '--load-images=no',          # do not download images
        '--disk-cache=yes',          # enable the disk cache
        '--ignore-ssl-errors=true',  # ignore HTTPS certificate errors
    ]
    browser = webdriver.PhantomJS(
        executable_path=executable_path,
        desired_capabilities=drap,
        service_args=server_args,
    )
    return browser
def catchweb(infile, url, outfile):
    """Look up each entry of ``infile`` on the page at ``url`` and save the text found.

    For every entry returned by ``readfile(infile)`` the page is loaded,
    the entry is typed into the element with id ``kw`` and the element with
    id ``su`` is clicked.  The text of the first ``<p>`` of the first list
    item under ``detailmean-wrapper`` is stored for that entry.  Entries
    for which an element cannot be found are reported on stdout and
    skipped.  The collected pairs are written to ``outfile`` as
    ``entry:text`` lines.

    Args:
        infile: File whose entries are looked up.
        url: Address of the lookup page.
        outfile: Path of the file that receives the ``entry:text`` lines.
    """
    driver = contributpjs()
    words = readfile(infile)
    allitem = {}

    try:
        # The implicit wait is a session-wide setting, so set it once.
        driver.implicitly_wait(5)
        for i in words:
            try:
                driver.get(url)
                driver.find_element_by_xpath(".//*[@id='kw']").clear()
                driver.find_element_by_xpath(".//*[@id='kw']").send_keys(i)
                driver.find_element_by_xpath(".//*[@id='su']").click()
                # Give the result page a moment to render.
                time.sleep(1)
                print(i)
                stringtext = driver.find_element_by_xpath(".//*[@id='detailmean-wrapper']/div[1]/dl/dd/ol/li[1]/p[1]").text
                allitem[i] = stringtext
            except NoSuchElementException:
                print(i+' is faile')
    finally:
        # Always stop the PhantomJS process, even after an unexpected error.
        driver.quit()

    with open(outfile, 'w') as file_out:
        for key, value in allitem.items():
            file_out.write(key)
            file_out.write(':')
            file_out.write(value)
            file_out.write('\n')

# Lookup page and the mapping of input word files to output files.
url = 'http://dict.baidu.com/'
filename={"aname.txt":"ziyi.txt","aname1.txt":"ziyi1.txt","aname2.txt":"ziyi2.txt","aname3.txt":"ziyi3.txt","aname4.txt":"ziyi4.txt"}
# Async results of the submitted jobs, kept so worker errors can be reported.
threads=[]

# Everything below stays inside the guard: with the "spawn" start method the
# worker processes re-import this module, and the timing prints and the pool
# set-up must only run in the parent.
if __name__ =='__main__':
    starttime = time.strftime('%Y-%m-%d %X',time.localtime())
    print(starttime)

    pool = multiprocessing.Pool(processes=5)
    for x,y in filename.items():
        threads.append(pool.apply_async(catchweb,(x,url,y)))
        print(x +" is start")
    pool.close()
    pool.join()

    # apply_async swallows worker exceptions unless the result is fetched;
    # report them instead of failing silently.
    for res in threads:
        try:
            res.get()
        except Exception as exc:
            print('worker failed: %r' % (exc,))

    finaltime = time.strftime('%Y-%m-%d %X',time.localtime())
    print(finaltime)

转载于:https://my.oschina.net/u/3317385/blog/888501