# Crawl the links found on the static Ganji (Taiyuan) home page.
#
# Fetches http://ty.ganji.com/ with urllib, locates the
# <div class="content-col"> container and prints every link inside it whose
# href starts with a lowercase letter, resolved against the site root.
from urllib.parse import urljoin
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

# The context manager closes the HTTP response as soon as parsing is done.
with urlopen("http://ty.ganji.com/") as html:
    # Name the parser explicitly: without it bs4 picks whichever parser is
    # installed (and warns), so results could differ between machines.
    bsObj = BeautifulSoup(html, "html.parser")

# find() returns None when the container is absent (layout change, block
# page, ...); stop with a readable message instead of an AttributeError.
content_col = bsObj.find("div", {"class": "content-col"})
if content_col is None:
    raise SystemExit('no <div class="content-col"> found on http://ty.ganji.com/')

# The href filter already guarantees that every matched tag has an href, so
# no separate membership check is needed inside the loop.
for link in content_col.find_all("a", href=re.compile(r"^[a-z]")):
    # urljoin yields the same result as plain concatenation for relative
    # paths, but leaves absolute URLs (which also match ^[a-z]) intact
    # instead of gluing them onto the site root.
    print(urljoin("http://ty.ganji.com/", link.attrs["href"]))
# The plain urlopen request ran into an HTTP 404 error, so this variant sends
# browser-like request headers through a requests session instead.
#
# Fetches the /fang1/ listing page, locates the
# <div class="f-list js-tips-list"> container and prints every link inside it
# whose href starts with "/", resolved against the page URL.
from urllib.parse import urljoin
from bs4 import BeautifulSoup
import requests
import re

session = requests.session()
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.110 Safari/537.36",
    # Media ranges must not contain spaces around "/", "+" or "="; the
    # previous value ("text / html, ...") was not a valid Accept header.
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
}

url = "http://ty.ganji.com/fang1/"
# requests has no default timeout; without one a stalled server would hang
# the script indefinitely.
req = session.get(url, headers=headers, timeout=10)
# Fail loudly on 4xx/5xx responses instead of parsing an error page.
req.raise_for_status()
# Name the parser explicitly so the result does not depend on which optional
# parsers are installed (this also silences bs4's warning).
bsObj = BeautifulSoup(req.text, "html.parser")

# find() returns None when the container is absent (layout change, anti-bot
# page, ...); stop with a readable message instead of an AttributeError.
listing = bsObj.find("div", {"class": "f-list js-tips-list"})
if listing is None:
    raise SystemExit('no <div class="f-list js-tips-list"> found on ' + url)

# The href filter already guarantees that every matched tag has an href.
# Raw string: "/" needs no escaping, and "\/" in a normal string literal is
# an invalid escape sequence.
for link in listing.find_all("a", href=re.compile(r"^/")):
    # urljoin gives the same URL as concatenating onto the site root for
    # "/path" hrefs, and additionally resolves protocol-relative
    # "//host/path" hrefs correctly.
    print(urljoin(url, link.attrs["href"]))