While scraping the Golden Pin Design Award winners with Python, I found that no matter how I changed the request headers, the JSON returned was always the same, even though loading the JSON in a browser returned fresh data. Then it occurred to me to use a requests session, and the data came out immediately!
Website:
goldenpin.org.tw
The Golden Pin Design Award is organized by the Taiwan Design Center. With 35 years of history, it is Taiwan's oldest, most authoritative, and best-known professional design competition. In 2014 it opened entry to the global Chinese-speaking market for the first time (adding mainland China, Hong Kong, Macau, Singapore, and Malaysia); thousands of companies have competed, with tens of thousands of entries submitted over the years. Media have dubbed it "the premier design award of the global Chinese market" and "the Golden Horse Award of the design world".
Target URL:
http://www.goldenpin.org.tw/金點設計獎/?y=2019
Captured request data:
The AJAX request that loads the paginated data:
The JSON response that carries the HTML page fragment:
The request headers were identical every time; whichever year I requested, only the page-number (pagination) field differed.
Changing the headers, filling in the Referer, even reproducing the complete header set never yielded different data.
The solution was a requests session: first visit the year's award page to establish the session, then fetch the JSON that carries the HTML.
How to use a requests session
While working with the requests library I ran into the question of how to set cookies on a session so that they apply to every request. A major feature of the Session object is that it manages cookies for you: when you log in to a page, it picks up the Set-Cookie headers in the response and keeps sending those cookies on all subsequent requests.
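A minimal sketch of that automatic behavior, using httpbin.org purely as a demo endpoint (not part of the original scraper):

import requests

s = requests.Session()
# The server answers with a Set-Cookie header; the Session stores it.
s.get('https://httpbin.org/cookies/set?demo=1')
# Subsequent requests on the same Session send the cookie back automatically.
r = s.get('https://httpbin.org/cookies')
print(r.json())  # {'cookies': {'demo': '1'}}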
There are two ways to add cookies:
Method 1: write the cookies as a dict, then convert the dict to a CookieJar:

s = requests.Session()  # open a Session
cookie_dict = {
    '49BAC005-7D5B-4231-8CEA-16939BEACD67': 'cktest001',  # cookie values taken from the Chrome dev tools
    'JSESSIONID': 'F4FFF69B8XXXXXXC8DCB4C061C0',
    'JSESSIONIDSSO': '9D49C76FD6XXXXXF294242B44A',
}
# Convert the dict to a CookieJar and hand it to the Session.
# Note: this replaces any cookies the Session already holds.
s.cookies = requests.utils.cookiejar_from_dict(cookie_dict, cookiejar=None, overwrite=True)
Method 2: append the cookies:
s = requests.Session()  # open a Session
jar = requests.cookies.RequestsCookieJar()  # create a CookieJar
jar.set('49BAC005-7D5B-4231-8CEA-1XXXXBEACD67', 'cktXXXX001')  # add cookie values to the jar
jar.set('JSESSIONID', 'F4FFF69B8CXXXX80F0C8DCB4C061C0')
jar.set('JSESSIONIDSSO', '9D49C7XXXX448FDF5B0F294242B44A')
s.cookies.update(jar)  # merge the cookies into the Session
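Whichever method you use, a quick sanity check (my addition, not from the original post) shows what the Session will actually send:

print(s.cookies.get_dict())  # equivalently: requests.utils.dict_from_cookiejar(s.cookies)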
With that, we can implement the Golden Pin Design Award scraper in Python.
Step 1: build the URLs to crawl:
def get_url():
    urls = []
    categorys = ["金點設計獎", "金點概念設計獎", "金點新秀設計獎"]
    years = ["2019", "2018", "2017", "2016", "2015"]
    for category in categorys:
        cate_gory = urllib.parse.quote(category)  # percent-encode the Chinese path segment
        for year in years:
            url = f"http://www.goldenpin.org.tw/{cate_gory}/?y={year}"
            print(url)
            urls.append(url)
    print(len(urls))
    return urls
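For reference, urllib.parse.quote percent-encodes the Chinese path segment, so the first generated URL comes out as:

>>> import urllib.parse
>>> urllib.parse.quote("金點設計獎")
'%E9%87%91%E9%BB%9E%E8%A8%AD%E8%A8%88%E7%8D%8E'

i.e. http://www.goldenpin.org.tw/%E9%87%91%E9%BB%9E%E8%A8%AD%E8%A8%88%E7%8D%8E/?y=2019.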
Step 2: random request headers
def ua():
    ua = UserAgent()
    headers = {"User-Agent": ua.random}
    return headers
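fake_useragent loads its browser list from cached or remote data and can fail at construction time, so a slightly more defensive variant (my own hardening, not in the original script) caches the instance and falls back to a fixed, hypothetical UA string:

from fake_useragent import UserAgent

_UA = None

def ua():
    global _UA
    try:
        if _UA is None:
            _UA = UserAgent()  # reuse one instance instead of rebuilding per call
        return {"User-Agent": _UA.random}
    except Exception:
        # Fallback: a fixed desktop Chrome UA string (placeholder value)
        return {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                              "AppleWebKit/537.36 (KHTML, like Gecko) "
                              "Chrome/79.0.3945.130 Safari/537.36"}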
Step 3: visit the year's landing page with requests to obtain the session
def get_session(furl):
    s = requests.session()
    s.mount('http://', HTTPAdapter(max_retries=3))
    s.mount('https://', HTTPAdapter(max_retries=3))
    try:
        fresponse = s.get(furl, headers=ua(), timeout=10)
        print(fresponse.status_code)  # inside the try, so a failed request cannot leave fresponse unbound
    except requests.exceptions.RequestException as e:
        print(f'Failed to access link, error: {e}')
        with open('furl_spider.txt', 'a+', encoding='utf-8') as f:
            f.write(f'{furl} - failed to access link, error: {e}\n')
    time.sleep(2)
    return s
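An integer max_retries only retries connection failures, not HTTP 5xx responses. If you also want status-based retries with backoff, urllib3's Retry object can be mounted instead (an optional refinement, not in the original script):

from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

retry = Retry(total=3, backoff_factor=1, status_forcelist=[500, 502, 503, 504])
s.mount('http://', HTTPAdapter(max_retries=retry))
s.mount('https://', HTTPAdapter(max_retries=retry))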
Step 4: fetch the JSON data
def get_req(s, i):
    url = "http://www.goldenpin.org.tw/ft-admin/admin-ajax.php"
    data = {
        'action': 'presscore_template_ajax',
        'postID': '15317',
        'paged': '1',
        'targetPage': i,
        'term': '',
        'orderby': '',
        'order': '',
        'nonce': '1f3d287a9a',
        'contentType': 'portfolio',
        'pageData[type]': 'page',
        'pageData[template]': 'portfolio',
        'pageData[layout]': 'masonry',
        'sender': 'more',
    }
    response = s.post(url, data=data, headers=ua(), timeout=10)
    print(response.status_code)
    if response.status_code == 200:
        req = response.content.decode('utf-8')
        html = json.loads(req)['html']  # the HTML fragment sits inside the JSON payload
        h3s = re.findall(r'<h3 class="entry-title"><a target="_blank" href="(.+?)" title="(.+?)" rel', html, re.S)
        print(len(h3s))
        for h3 in h3s:
            title = h3[1]
            href = h3[0]
            item = title, href  # renamed from `data` to avoid shadowing the POST payload above
            print(item)
            get_content(href)
            time.sleep(2)
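Note that the nonce ('1f3d287a9a') and postID are hard-coded from one captured request, so they can expire or differ between pages. A hedged sketch for extracting the nonce from the landing page instead; the regex is an assumption about how this WordPress theme embeds it, so check it against the real page source:

def get_nonce(s, furl):
    # Assumption: the theme embeds something like "nonce":"1f3d287a9a"
    # in an inline <script> on the landing page; adjust the pattern if it doesn't.
    page = s.get(furl, headers=ua(), timeout=10).text
    m = re.search(r'"nonce"\s*:\s*"([0-9a-f]+)"', page)
    return m.group(1) if m else None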
Fetching each award-winning work's detail data:
def get_content(url):
    # e.g. url = "http://www.goldenpin.org.tw/project/ps63/"
    response = requests.get(url, headers=ua(), timeout=10).content.decode('utf-8')  # a plain GET, since we only read the page
    time.sleep(2)
    html = etree.HTML(response)
    category = html.xpath('//ol[@class="breadcrumbs text-small"]/a[1]/text()')[0]
    print(category)
    year = html.xpath('//ol[@class="breadcrumbs text-small"]/a[2]/text()')[0]
    print(year)
    subclassification = html.xpath('//ol[@class="breadcrumbs text-small"]/text()')[0]
    print(subclassification)
    h1 = html.xpath('//h1[@class="entry-title"]/text()')[0]
    h1 = re.sub(r'[\|\/\<\>\:\*\?\'\\"]', "_", h1)  # strip characters that are illegal in file names
    print(h1)  # the work's title
    path = f"{year}/{category}/{subclassification}/"
    os.makedirs(path, exist_ok=True)  # create the output directory
    descriptions = html.xpath('//div[@class="wpb_wrapper"]//text()')
    description = ''.join(descriptions)
    texts = '%s%s%s' % (h1, '\n', description)
    print(description)
    imgs = html.xpath('//div[@class="fancy-media-wrap layzr-bg"]/img[@class="lazy-load preload-me"]/@data-src')
    down(h1, imgs, path, texts)
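The re.sub call makes titles safe to use as file names. For example (illustrative input):

>>> re.sub(r'[\|\/\<\>\:\*\?\'\\"]', "_", 'Design: A/B <Test>')
'Design_ A_B _Test_'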
Download module:
def down(h1, imgs, path, texts):
    try:
        print(f'>>> Saving {h1}.txt ...')
        with open(f'{path}/{h1}.txt', 'w', encoding='utf-8') as f:
            f.write(texts)
        print(f'>>> Saved {h1}.txt!')
    except Exception as e:
        print(f'Failed to save detail text, error: {e}')
        with open(f'{path}/text_spider.txt', 'a+', encoding='utf-8') as f:
            f.write(f'{h1},{texts} - failed to save detail text, error: {e}\n')
    i = 1
    for img in imgs:
        img_url = img
        suffix = os.path.splitext(img_url)[1]  # file extension
        img_name = '%s%s%d%s' % (h1, '_', i, suffix)
        print(f'>>> Downloading image {img_name} ...')
        s = requests.Session()
        s.mount('http://', HTTPAdapter(max_retries=3))
        s.mount('https://', HTTPAdapter(max_retries=3))
        try:
            r = s.get(img_url, timeout=20, headers=ua())
            with open(f'{path}/{img_name}', 'wb') as f:
                f.write(r.content)
            print(f'>>> Downloaded image {img_name}!')
        except requests.exceptions.RequestException as e:
            print(f'Failed to download image, error: {e}')
            with open(f'{path}/img_spider.txt', 'a+', encoding='utf-8') as f:
                f.write(f'{img_url},{img_name},{path} - failed to download image, error: {e}\n')
        time.sleep(1)
        i = i + 1
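For large images you can stream the body to disk instead of holding it all in memory (an optional variant of the download step):

r = s.get(img_url, timeout=20, headers=ua(), stream=True)
with open(f'{path}/{img_name}', 'wb') as f:
    for chunk in r.iter_content(chunk_size=8192):
        f.write(chunk)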
Finally, create a main() function to run the scraper:
def main():
    urls = get_url()
    for furl in urls:
        print(f'>>> Scraping {furl} ...')
        try:
            s = get_session(furl)
            for i in range(5, 6):  # note: this requests page 5 only; widen the range to cover more pages
                try:
                    get_req(s, i)
                except Exception as e:
                    print(f'Failed to fetch page data, error: {e}')
                    with open('json_spider.txt', 'a+', encoding='utf-8') as f:
                        f.write(f'{furl},{i} - failed to fetch page data, error: {e}\n')
                time.sleep(1)
        except Exception as e:
            print(f'Failed to fetch page data, error: {e}')
            with open('spider.txt', 'a+', encoding='utf-8') as f:
                f.write(f'{furl} - failed to fetch page data, error: {e}\n')
        time.sleep(5)
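Since range(5, 6) touches a single page, a hedged sketch for walking every page (this assumes get_req is refactored to return len(h3s) so the loop can detect an empty page):

def crawl_all_pages(s, max_pages=50):
    for i in range(1, max_pages + 1):
        found = get_req(s, i)  # assumed to return the number of entries matched
        if not found:
            break  # an empty page means we ran past the last one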
Run results:
Appendix: full source code:
# -*- coding: utf-8 -*-
# Golden Pin Design Award scraper
# 2020-01-03 by WeChat: huguo00289
import requests
from fake_useragent import UserAgent
import json, re, os, time
from lxml import etree
from requests.adapters import HTTPAdapter  # for mounting retry adapters
import urllib.parse


def ua():
    ua = UserAgent()
    headers = {"User-Agent": ua.random}
    return headers


def get_session(furl):
    s = requests.session()
    s.mount('http://', HTTPAdapter(max_retries=3))
    s.mount('https://', HTTPAdapter(max_retries=3))
    try:
        fresponse = s.get(furl, headers=ua(), timeout=10)
        print(fresponse.status_code)
    except requests.exceptions.RequestException as e:
        print(f'Failed to access link, error: {e}')
        with open('furl_spider.txt', 'a+', encoding='utf-8') as f:
            f.write(f'{furl} - failed to access link, error: {e}\n')
    time.sleep(2)
    return s


def get_req(s, i):
    url = "http://www.goldenpin.org.tw/ft-admin/admin-ajax.php"
    data = {
        'action': 'presscore_template_ajax',
        'postID': '15317',
        'paged': '1',
        'targetPage': i,
        'term': '',
        'orderby': '',
        'order': '',
        'nonce': '1f3d287a9a',
        'contentType': 'portfolio',
        'pageData[type]': 'page',
        'pageData[template]': 'portfolio',
        'pageData[layout]': 'masonry',
        'sender': 'more',
    }
    response = s.post(url, data=data, headers=ua(), timeout=10)
    print(response.status_code)
    if response.status_code == 200:
        req = response.content.decode('utf-8')
        html = json.loads(req)['html']
        h3s = re.findall(r'<h3 class="entry-title"><a target="_blank" href="(.+?)" title="(.+?)" rel', html, re.S)
        print(len(h3s))
        for h3 in h3s:
            title = h3[1]
            href = h3[0]
            item = title, href
            print(item)
            get_content(href)
            time.sleep(2)


def get_content(url):
    # e.g. url = "http://www.goldenpin.org.tw/project/ps63/"
    response = requests.get(url, headers=ua(), timeout=10).content.decode('utf-8')
    time.sleep(2)
    html = etree.HTML(response)
    category = html.xpath('//ol[@class="breadcrumbs text-small"]/a[1]/text()')[0]
    print(category)
    year = html.xpath('//ol[@class="breadcrumbs text-small"]/a[2]/text()')[0]
    print(year)
    subclassification = html.xpath('//ol[@class="breadcrumbs text-small"]/text()')[0]
    print(subclassification)
    h1 = html.xpath('//h1[@class="entry-title"]/text()')[0]
    h1 = re.sub(r'[\|\/\<\>\:\*\?\'\\"]', "_", h1)  # strip characters that are illegal in file names
    print(h1)  # the work's title
    path = f"{year}/{category}/{subclassification}/"
    os.makedirs(path, exist_ok=True)  # create the output directory
    descriptions = html.xpath('//div[@class="wpb_wrapper"]//text()')
    description = ''.join(descriptions)
    texts = '%s%s%s' % (h1, '\n', description)
    print(description)
    imgs = html.xpath('//div[@class="fancy-media-wrap layzr-bg"]/img[@class="lazy-load preload-me"]/@data-src')
    down(h1, imgs, path, texts)


def down(h1, imgs, path, texts):
    try:
        print(f'>>> Saving {h1}.txt ...')
        with open(f'{path}/{h1}.txt', 'w', encoding='utf-8') as f:
            f.write(texts)
        print(f'>>> Saved {h1}.txt!')
    except Exception as e:
        print(f'Failed to save detail text, error: {e}')
        with open(f'{path}/text_spider.txt', 'a+', encoding='utf-8') as f:
            f.write(f'{h1},{texts} - failed to save detail text, error: {e}\n')
    i = 1
    for img in imgs:
        img_url = img
        suffix = os.path.splitext(img_url)[1]  # file extension
        img_name = '%s%s%d%s' % (h1, '_', i, suffix)
        print(f'>>> Downloading image {img_name} ...')
        s = requests.Session()
        s.mount('http://', HTTPAdapter(max_retries=3))
        s.mount('https://', HTTPAdapter(max_retries=3))
        try:
            r = s.get(img_url, timeout=20, headers=ua())
            with open(f'{path}/{img_name}', 'wb') as f:
                f.write(r.content)
            print(f'>>> Downloaded image {img_name}!')
        except requests.exceptions.RequestException as e:
            print(f'Failed to download image, error: {e}')
            with open(f'{path}/img_spider.txt', 'a+', encoding='utf-8') as f:
                f.write(f'{img_url},{img_name},{path} - failed to download image, error: {e}\n')
        time.sleep(1)
        i = i + 1


def get_url():
    urls = []
    categorys = ["金點設計獎", "金點概念設計獎", "金點新秀設計獎"]
    years = ["2019", "2018", "2017", "2016", "2015"]
    for category in categorys:
        cate_gory = urllib.parse.quote(category)
        for year in years:
            url = f"http://www.goldenpin.org.tw/{cate_gory}/?y={year}"
            print(url)
            urls.append(url)
    print(len(urls))
    return urls


def main():
    urls = get_url()
    for furl in urls:
        print(f'>>> Scraping {furl} ...')
        try:
            s = get_session(furl)
            for i in range(5, 6):
                try:
                    get_req(s, i)
                except Exception as e:
                    print(f'Failed to fetch page data, error: {e}')
                    with open('json_spider.txt', 'a+', encoding='utf-8') as f:
                        f.write(f'{furl},{i} - failed to fetch page data, error: {e}\n')
                time.sleep(1)
        except Exception as e:
            print(f'Failed to fetch page data, error: {e}')
            with open('spider.txt', 'a+', encoding='utf-8') as f:
                f.write(f'{furl} - failed to fetch page data, error: {e}\n')
        time.sleep(5)


if __name__ == "__main__":
    main()