仅供参考,以学习为主
一、用selenium获取用户的个人链接地址,并保存到excel中
脚本名:url.py
import requests
from bs4 import BeautifulSoup
import chardet
import random
import openpyxl
from openpyxl import load_workbook
import re
from selenium import webdriver
import time
# --- Output workbook: one profile URL per row under the 'http' header ---
wb = load_workbook('data/世纪佳缘.xlsx')
ws = wb.sheetnames  # list of sheet names; the first sheet receives the data
sheet = wb[ws[0]]
sheet.append(['http'])

# --- Log in through the browser (the search page shows a login form first) ---
diver = webdriver.Chrome()
diver.maximize_window()
url = "http://search.jiayuan.com/v2/index.php?key=&sex=f&stc=2:19.29&sn=default&sv=1&p=1&pt=21234&ft=off&f=select&mt=d"
diver.get(url)
time.sleep(5)  # wait for the login widget to render
user = diver.find_element_by_id('login_email_new')
user.send_keys('写入用户名')  # placeholder: fill in the account name
time.sleep(1)
passwd = diver.find_element_by_id('login_password_new')
passwd.send_keys('写入密码')  # placeholder: fill in the password
time.sleep(1)
# The second element with class 'login_btn' is the actual submit button.
login = diver.find_elements_by_class_name('login_btn')[1]
login.click()
time.sleep(1)

# --- Crawl 500 result pages in total ---
for i in range(1, 501):
    print("****************正在爬取{}页**************".format(i))
    # URL for male profiles (sex=m).
    url = "http://search.jiayuan.com/v2/index.php?key=&sex=m&stc=2:19.29&sn=default&sv=1&p={}&pt=21234&ft=off&f=select&mt=d".format(i)
    # URL for female profiles (sex=f) -- run only one of the two per pass.
    # url = "http://search.jiayuan.com/v2/index.php?key=&sex=f&stc=2:19.29&sn=default&sv=1&p={}&pt=21234&ft=off&f=select&mt=d".format(i)
    diver.get(url)
    time.sleep(1)
    # Each <li> under the result container is one user card.
    parent = diver.find_element_by_id('normal_user_container').find_elements_by_tag_name('li')
    time.sleep(1)
    for each in parent:
        data = []
        # The profile link is the first <a> inside the 'user_name' element.
        data.append(each.find_elements_by_class_name('user_name')[0].find_elements_by_tag_name('a')[0].get_attribute('href'))
        sheet.append(data)
    # Save after every page so a mid-run crash loses at most one page of URLs.
    wb.save('data/世纪佳缘.xlsx')
二、读取上一步保存的url,再爬取每个人的具体信息
脚本名为:person.py
from pandas import DataFrame, Series
import pandas as pd
from openpyxl import load_workbook
import re
from selenium import webdriver
import time
# Load the URL list produced by url.py into a DataFrame (column 'http').
df = pd.read_excel('data/世纪佳缘.xlsx')
# Open the output workbook that will hold the per-person detail rows.
wb = load_workbook('data/世纪佳缘详细.xlsx')
ws = wb.sheetnames  # list of sheet names; the first sheet receives the data
sheet = wb[ws[0]]
# Header row; the an_* columns presumably describe the user's desired
# partner (age/height/education/address) -- confirm against the page layout.
sheet.append(
['name_id', 'age_form', 'education', 'hight', 'car', 'salay', 'house', 'weight', 'xingzuo', 'shuxing', 'blood',
'sex', 'an_age', 'an_hight', 'an_education', 'an_adress', 'introduce'])
# Launch Chrome and log in via the form shown on the search page.
diver = webdriver.Chrome()
diver.maximize_window()
url = "http://search.jiayuan.com/v2/index.php?key=&sex=f&stc=2:19.29&sn=default&sv=1&p=1&pt=21234&ft=off&f=select&mt=d"
diver.get(url)
time.sleep(5)  # wait for the login widget to render
user = diver.find_element_by_id('login_email_new')
user.send_keys('写入用户名')  # placeholder: fill in the account name
time.sleep(1)
passwd = diver.find_element_by_id('login_password_new')
passwd.send_keys('写入密码')  # placeholder: fill in the password
time.sleep(1)
# The second element with class 'login_btn' is the actual submit button.
login = diver.find_elements_by_class_name('login_btn')[1]
login.click()
# 循环爬取链接
def login(i):
    """Crawl every profile URL in df['http'] from row *i* onward.

    *i* doubles as the progress counter printed for each link.  When a
    profile fails to scrape, getData's handler re-enters this function at
    i + 1 (note: this recursion grows the stack by one frame per failure).
    Returns the shared webdriver instance.
    """
    for each in df['http'][i:]:
        print("正在爬取第{}个连接".format(i), each)
        getData(each, i)
        i += 1
    return diver
def getData(url, i):
    """Open one profile *url*, scrape its fields, and append a row to *sheet*.

    Column order must match the header written at module level:
    name_id, age_form, education..blood, sex, an_age..an_adress, introduce.
    On any scraping error (layout change, missing element, blocked page) the
    record is skipped and crawling resumes from link i + 1 via login().
    """
    datalist = []
    diver.get(url)
    time.sleep(1)
    try:
        # Identity block: nickname/id and the age/location line.
        name_id = diver.find_element_by_class_name('member_info_r').find_element_by_tag_name('h4').text
        datalist.append(name_id)
        age_form = diver.find_element_by_class_name('member_name').text
        datalist.append(age_form)
        # Detail list: each attribute sits at a fixed <li> position; the
        # second <div> inside the <li> holds the value text.
        parent = diver.find_elements_by_class_name('member_info_list')
        for each in parent:
            education = each.find_elements_by_tag_name('li')[0].find_elements_by_tag_name('div')[1].text
            datalist.append(education)
            hight = each.find_elements_by_tag_name('li')[1].find_elements_by_tag_name('div')[1].text
            datalist.append(hight)
            car = each.find_elements_by_tag_name('li')[2].find_elements_by_tag_name('div')[1].text
            datalist.append(car)
            salay = each.find_elements_by_tag_name('li')[3].find_elements_by_tag_name('div')[1].text
            datalist.append(salay)
            house = each.find_elements_by_tag_name('li')[4].find_elements_by_tag_name('div')[1].text
            datalist.append(house)
            weight = each.find_elements_by_tag_name('li')[5].find_elements_by_tag_name('div')[1].text
            datalist.append(weight)
            xingzuo = each.find_elements_by_tag_name('li')[6].find_elements_by_tag_name('div')[1].text
            datalist.append(xingzuo)
            shuxing = each.find_elements_by_tag_name('li')[8].find_elements_by_tag_name('div')[1].text
            datalist.append(shuxing)
            blood = each.find_elements_by_tag_name('li')[9].find_elements_by_tag_name('div')[1].text
            datalist.append(blood)
        # Self-introduction text, appended last to match the header order.
        introduce = diver.find_elements_by_class_name('js_text')[0].text
        # js_box[2] presumably holds the "ideal partner" section -- confirm
        # against the live page layout.
        sex = diver.find_elements_by_class_name('js_box')[2].find_elements_by_tag_name('h4')[0].text
        datalist.append(sex)
        parent2 = diver.find_elements_by_class_name('js_box')[2].find_elements_by_tag_name('ul')
        for each in parent2:
            an_age = each.find_elements_by_class_name('ifno_r_con')[0].text
            datalist.append(an_age)
            an_hight = each.find_elements_by_class_name('ifno_r_con')[1].text
            datalist.append(an_hight)
            an_education = each.find_elements_by_class_name('ifno_r_con')[3].text
            datalist.append(an_education)
            an_adress = each.find_elements_by_class_name('ifno_r_con')[6].text
            datalist.append(an_adress)
        datalist.append(introduce)
        sheet.append(datalist)
        # Save after every record so a crash loses at most one row.
        wb.save('data/世纪佳缘详细.xlsx')
        return
    except Exception:
        # Was a bare `except:`, which also swallowed KeyboardInterrupt and
        # made the crawl un-interruptible; Exception keeps Ctrl-C working.
        login(i + 1)
        return
# Entry point: start crawling from the first data row of the URL sheet.
login(1)