简介
Colly 是一个用于构建网络爬虫的 Golang 框架。使用 Colly,您可以构建各种复杂的网络爬虫,从简单的爬虫到处理数百万网页的复杂异步网站爬虫。Colly 提供了一个 API 用于执行网络请求和处理接收到的内容(例如与 HTML 文档的 DOM 树交互)。
import json
import logging
import threading
import time

import requests


class IBoxNFT:
    """Holds the iBox login credentials (phone number + SMS code)."""

    def __init__(self, phone, sms):
        self.phone = phone
        self.sms = sms


class Crawler(IBoxNFT):
    """Crawler for the iBox NFT marketplace API."""

    def __init__(self, basic, suffix, phone, sms):
        self.basic = basic
        self.link = f'{self.basic}/{suffix}'
        # Flipped to True when no listing matched the price rules.
        self.un_know = False
        self.start = time.perf_counter()
        super(Crawler, self).__init__(phone, sms)

    def login_ibox(self, version=None, new_suffix=None):
        """Log in to iBox.

        The login endpoint uses a different API version and URL suffix than
        the listing endpoint, so both can be overridden.

        :param version: API version for the login call (defaults to "v1.1").
        :param new_suffix: URL suffix for the login call (defaults to "/user/login").
        :return: None; prints the JSON body of the login response.
        """
        # BUG FIX: the original tested `if not (version, new_suffix)`, which is
        # always False (a non-empty tuple is truthy), so the defaults were dead
        # code and the request hard-coded "v1.1"/"/user/login" regardless.
        if version is None:
            version = "v1.1"
        if new_suffix is None:
            new_suffix = "/user/login"
        header = Crawler.public_params("headers")
        resp = Crawler.crawler_link(
            'post',
            self.suffix(version, new_suffix),
            headers=header,
            json={"phoneNumber": self.phone, "code": str(self.sms)},
        )
        print("登录信息\n", resp.json())

    def suffix(self, version, new_suffix):
        """Build a request URL: switch to https, swap in *version*, append *new_suffix*.

        NOTE(review): the `[:46]` slice truncates the base listing URL down to
        its "nft-mall-web/<version>" prefix — fragile if the base URL length
        ever changes; verify against the live API paths.
        """
        link = f"{self.link.replace('http', 'https').replace('v1.2', str(version))[:46]}{new_suffix}"
        return link

    @staticmethod
    def public_params(param) -> dict:
        """Return shared request parameters selected by *param*.

        Supported selectors: "crawler_min_nft" / "crawler_new_nft" (listing
        query params, differing only in sort order) and "headers" (common
        HTTP request headers).
        """
        # BUG FIX: the original "headers" branch had a trailing comma after the
        # dict literal, making it return a one-element tuple (contradicting the
        # `-> dict` annotation) and forcing every caller to index with [0].
        # The pointless inner closure is gone as well.
        if "crawler_min_nft" in param:
            return {"classifyId": "", "origin": "0", "pageSize": "20",
                    "sort": "1", "page": "1", "type": "0"}
        if "crawler_new_nft" in param:
            return {"classifyId": "", "origin": "0", "pageSize": "20",
                    "sort": "0", "page": "1", "type": "0"}
        if "headers" in param:
            return {
                'Accept-Encoding': 'gzip',
                'IB-PLATFORM-TYPE': 'android',
                'Host': 'api-app.ibox.art',
                'language': 'zh-CN',
                'ib-app-version': ' 1.1.4',
                'Content-Type': 'application/json; charset=UTF-8',
                'Accept-Language': 'zh-CN',
                'Connection': 'Keep-Alive',
                'user-agent': 'iBoxApp209',
            }

    @property
    def get_links(self):
        """Current request URL."""
        return self.link

    @get_links.setter
    def get_links(self, new_link):
        """Update the request URL (iBox ships new API versions quickly).

        :param new_link: the new base request URL.
        """
        self.link = new_link

    # Backward-compatible alias: the original exposed the setter under the
    # separate property name `upgrade_links`.
    upgrade_links = get_links

    @property
    def check_status(self) -> bool:
        """True once a crawl found no listing matching the price rules."""
        return self.un_know

    def crawler_book_nft(self, payload: dict):
        # Not implemented yet.
        pass

    def crawler_min_nft(self):
        """Fetch the current listings and return the lowest-priced match.

        :return: dict describing the LAST listing that matched the price
                 rules (empty dict when nothing matched; `un_know` is then
                 set to True).
        """
        resp = Crawler.crawler_link(
            'get',
            self.link,
            headers=Crawler.public_params("headers"),
            params=Crawler.public_params(self.crawler_min_nft.__name__),
        )
        nft = {}
        matched = False
        for item in self.get_min_price(resp.json()):
            matched = True
            nft = {
                '图藏系列': item['albumName'],
                '价格': item['priceCny'],
                '图藏URL': "www.ibox.art{}".format(item['thumbPic']),
            }
        # BUG FIX: the original tested `if not i` on each yielded dict, which
        # is never falsy, so `un_know` could never become True. Flag only when
        # the generator produced no match at all.
        if not matched:
            self.un_know = True
        return nft

    @staticmethod
    def get_min_price(args):
        """Yield listing entries whose album/price satisfy the alert rules.

        :param args: decoded JSON response containing data.list entries with
                     'albumName' and 'priceCny' keys.
        """
        for item in args['data']['list']:
            name = item['albumName']
            price = int(item['priceCny'])
            if name == 'iBox礼遇系列' and price < 800:
                yield item
            elif name == 'iBox纪念系列' and price < 600:
                yield item
            elif name == 'iBox赛博生肖系列' and price > 800:
                yield item
            elif name == '异星夜袭' and price > 2000:
                yield item

    @staticmethod
    def crawler_link(methods, link, headers, params=None, data=None, json=None, proxy=None):
        """Issue an HTTP request.

        :param methods: 'get' or 'post' (substring match, as originally written).
        :param link: target URL.
        :param headers: request headers.
        :param params: query-string parameters (GET).
        :param data: form body (POST).
        :param json: JSON body (POST).
        :param proxy: optional requests proxies mapping.
        :return: requests.Response on success, None on failure or unknown method.
        """
        try:
            if 'get' in methods:
                return requests.get(link, params=params, headers=headers, proxies=proxy)
            if 'post' in methods:
                # BUG FIX: the original dropped `headers` on POST requests.
                return requests.post(link, data=data, json=json,
                                     headers=headers, proxies=proxy)
        except requests.exceptions.RequestException as e:
            # BUG FIX: the original handlers referenced `r.status_code` where
            # `r` may be unbound, and passed stray positional args to
            # logging.error. RequestException already subsumes
            # ConnectionError/ConnectTimeout, so one handler suffices.
            logging.error("The request failed: %s", e)
        return None

    @staticmethod
    def genrator_wtoken():
        # NOTE(review): name keeps the original's "genrator" typo for
        # backward compatibility; not implemented yet.
        pass

    def run(self):
        """Crawl once and print the lowest-priced matching NFT."""
        print(self.crawler_min_nft())


if __name__ == '__main__':
    print("启动爬虫获取ibox")
    # while True:
    a = Crawler("http://api-app.ibox.art",
                "nft-mall-web/v1.2/nft/product/getResellList",
                "13420280437", 516254)
    a.login_ibox()
    # if a.check_status:
    #     break
初始化Collector采集实例
colly.NewCollector(options ...func(*Collector)) *Collector
采集器可以设置多个可选参数:
限制爬取的域名
- colly.AllowedDomains("www.baidu.com", "baidu.com")
设置请求头
- colly.UserAgent("xy")
启动异步
- colly.Async(true)
启动调试器
- colly.Debugger(&debug.LogDebugger{})
设置爬取页面的深度, 如果设置为1,只抓取Visit()中的url
- colly.MaxDepth(1)
回调函数
在发起请求前被调用
func (c *Collector) OnRequest(f RequestCallback) {}
在请求过程中发生错误被调用
func (c *Collector) OnError(f ErrorCallback) {}
收到响应后被调用
func (c *Collector) OnResponse(f ResponseCallback) {}
收到内容按html的jquery进行数据清洗
func (c *Collector) OnHTML(goquerySelector string,f HTMLCallback) {}
收到内容按XML的xpath进行数据清洗
func (c *Collector) OnXML(xpathQuery string,f XMLCallback) {}
在OnHTML之后被调用
func (c *Collector) OnScraped(f ScrapedCallback) {}
