ArxivExtractor.py
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
"""
@File       : ArxivExtractor.py
@Contact    : htkstudy@163.com
@Author     : Armor(htk)
@Version    : 1.0
@Description: Scrape arXiv daily-digest thread pages from arxivdaily.com,
              save pages whose title matches a topic keyword as local HTML
              files, and provide an SMTP helper to e-mail a saved file.
"""
import random
import re
import smtplib
import time
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText

import numpy as np
from selenium import webdriver
from selenium.webdriver.chrome.options import Options


def seed_email(file_name, receivers_list):
    """Send the HTML file *file_name* as an e-mail body to every address
    in *receivers_list* through the 163.com SMTP server.

    :param file_name: path of a UTF-8 encoded HTML file to use as the body
    :param receivers_list: list of recipient addresses, e.g. ['a@qq.com']
    """
    # SMTP server configuration (163.com mailbox).
    mail_host = 'smtp.163.com'
    mail_user = '*****@163.com'   # account name
    mail_pass = '**********'      # password (authorization code for some providers)
    sender = '*****@163.com'      # sender address
    receivers = receivers_list    # may contain several addresses for a group send

    # Build the message envelope.
    message = MIMEMultipart()
    message['From'] = sender
    # BUG FIX: list every recipient in the To: header; the original only
    # named receivers[0] even though the mail was delivered to all of them.
    message['To'] = ','.join(receivers)
    message['Subject'] = 'Arxiv每日推送'

    # HTML body is preferred: it keeps formatting and can embed image links.
    with open(file_name, 'r', encoding="utf-8") as f:
        content = f.read()
    message.attach(MIMEText(content, 'html', 'utf-8'))

    # Log in and send; report success/failure on stdout.
    try:
        smtpObj = smtplib.SMTP()
        smtpObj.connect(mail_host, 25)
        smtpObj.login(mail_user, mail_pass)
        smtpObj.sendmail(sender, receivers, message.as_string())
        smtpObj.quit()
        print('success')
    except smtplib.SMTPException as e:
        print('error', e)


class GeneralArxivExtractor(object):
    """Extract digest content from an arxivdaily.com thread page."""

    def extract(self, html):
        """Parse *html*, print the page title, and return 0.

        NOTE(review): *subjects* is computed but unused — kept for parity
        with the original; confirm before removing.
        """
        title = re.findall(r'<title>(.*?) - arXiv每日学术速递</title>', html)[0]
        print(title)
        subhtml = re.findall(
            r'<article data-v-f3c566ae="" data-v-04c933e6="" class="global">(.*?)</article>',
            html)[0]
        subjects = re.findall(r'<center>(.*?)</center>', subhtml)[1:]
        return 0

    def simple(self, html, spical):
        """If the page title contains any keyword in *spical*, save the
        article body to ``html/<title>.html``.

        :param html: full page source with newlines already stripped
        :param spical: iterable of keyword strings to match against the title
        """
        title = re.findall(r'<title>(.*?) - arXiv每日学术速递</title>', html)[0]
        print(title)
        if any(key in title for key in spical):
            subhtml = re.findall(
                r'<article data-v-f3c566ae="" data-v-04c933e6="" class="global">(.*?)</article>',
                html)[0]
            front = '<!DOCTYPE html><html lang="en"><head><meta charset="UTF-8"></head><body>'
            tail = '</body></html>'
            # BUG FIX: close the document with *tail*; the original appended
            # *front* twice and never emitted </body></html>, producing
            # malformed HTML.
            html = front + subhtml + tail
            with open("html/{}.html".format(title), "w", encoding='utf-8') as f:
                f.write(html)


class SelenuimLoading(object):
    """Thin wrapper around a Chrome webdriver for page scraping."""

    def __init__(self):
        # Chrome option flags.
        self.chrome_options = Options()
        self.chrome_options.add_argument('--disable-gpu')      # works around a Chrome bug per Google docs
        self.chrome_options.add_argument('--hide-scrollbars')  # hide scrollbars on unusual pages
        self.chrome_options.add_argument('blink-settings=imagesEnabled=false')  # skip images for speed
        # NOTE(review): hard-coded local driver path — adjust per machine.
        self.driver = webdriver.Chrome(
            executable_path=r'D:\HuProject\chromedriver_win32\chromedriver.exe',
            options=self.chrome_options)

    def Spider_Html(self, url):
        """Load *url* and return its rendered page source."""
        self.driver.implicitly_wait(60)
        self.driver.get(url)
        time.sleep(random.randint(5, 8))  # let dynamic content finish rendering
        return self.driver.page_source

    def close(self):
        self.driver.close()


if __name__ == '__main__':
    # Topic keywords to keep.
    sp = ['人工智能', '机器学习', '计算机视觉', '自然语言处理']
    # Thread ids 11151 down to 970, newest first.
    url_format = "https://www.arxivdaily.com/thread/{}"
    urls = [url_format.format(i) for i in np.arange(970, 11152)[::-1]]
    web = SelenuimLoading()
    extractor = GeneralArxivExtractor()
    for idx, url in enumerate(urls):
        try:
            html = web.Spider_Html(url)
            timeput = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            print("{} {}/{} ".format(timeput, idx + 1, len(urls)), sep=" ")
            extractor.simple(html.replace("\n", ""), sp)
        except Exception as e:
            # Narrowed from a bare except: report the failing url and keep going.
            print(urls[idx], "出错", e)
    web.close()
SeedArxiv.py
"""Poll the arxivdaily.com NLP category, save the newest digest page as
HTML, and e-mail it out via :func:`ArxivExtractor.seed_email`."""
import datetime
import os
import random
import re
import time

import numpy as np
from selenium import webdriver
from selenium.webdriver import ChromeOptions
from selenium.webdriver.chrome.options import Options

from ArxivExtractor import seed_email


class GeneralArxivExtractor(object):
    """Extract digest content from an arxivdaily.com thread page."""

    def extract(self, html):
        """Parse *html*, print the page title, and return 0.

        NOTE(review): *subjects* is computed but unused — kept for parity
        with the original; confirm before removing.
        """
        title = re.findall(r'<title>(.*?) - arXiv每日学术速递</title>', html)[0]
        print(title)
        subhtml = re.findall(
            r'<article data-v-f3c566ae="" data-v-04c933e6="" class="global">(.*?)</article>',
            html)[0]
        subjects = re.findall(r'<center>(.*?)</center>', subhtml)[1:]
        return 0

    def simple(self, html, spical):
        """If the page title contains any keyword in *spical*, save the
        article body to ``html/<title>.html``.

        :param html: full page source with newlines already stripped
        :param spical: iterable of keyword strings to match against the title
        """
        title = re.findall(r'<title>(.*?) - arXiv每日学术速递</title>', html)[0]
        print(title)
        if any(key in title for key in spical):
            subhtml = re.findall(
                r'<article data-v-f3c566ae="" data-v-04c933e6="" class="global">(.*?)</article>',
                html)[0]
            front = '<!DOCTYPE html><html lang="en"><head><meta charset="UTF-8"></head><body>'
            tail = '</body></html>'
            # BUG FIX: close the document with *tail*; the original appended
            # *front* twice and never emitted </body></html>.
            html = front + subhtml + tail
            with open("html/{}.html".format(title), "w", encoding='utf-8') as f:
                f.write(html)


class SelenuimLoading(object):
    """Headless Chrome webdriver wrapper that clicks through to the
    newest thread of a category page."""

    def __init__(self):
        # BUG FIX: the original built TWO option objects and passed both
        # (options= and chrome_options=) to webdriver.Chrome; those
        # parameters are aliases and conflict, and the 'headless' flag sat
        # on the secondary object.  All flags now live on one object.
        self.options = Options()
        self.options.add_argument('headless')                  # run without a visible window
        self.options.add_argument('--disable-gpu')             # works around a Chrome bug per Google docs
        self.options.add_argument('--hide-scrollbars')         # hide scrollbars on unusual pages
        self.options.add_argument('blink-settings=imagesEnabled=false')  # skip images for speed
        # NOTE(review): hard-coded local driver path — adjust per machine.
        self.driver = webdriver.Chrome(
            executable_path=r'D:\HuProject\chromedriver_win32\chromedriver.exe',
            options=self.options)

    def Spider_Html(self, url):
        """Open the category page *url*, click the first (newest) thread,
        switch to the new tab, and return its rendered page source."""
        self.driver.implicitly_wait(60)
        self.driver.get(url)
        first = self.driver.find_element_by_xpath(
            r'//*[@id="__layout"]/div/div[2]/div/main/div[2]/div[1]/div[1]/div/div/div/div[2]/a/div/span')
        first.click()
        # The click opens a new tab; work in the most recent window handle.
        self.driver.switch_to.window(self.driver.window_handles[-1])
        time.sleep(random.randint(5, 8))  # let dynamic content finish rendering
        return self.driver.page_source

    def close(self):
        self.driver.quit()


def scray(choose):
    """Scrape the category named *choose* and store any matching digest.

    :param choose: one of the category names in the ``urls`` mapping below
    """
    urls = {
        '计算机视觉': "https://www.arxivdaily.com/category/19?search_ids=19",
        '自然语言处理': "https://www.arxivdaily.com/category/20?search_ids=20",
        '人工智能': 'https://www.arxivdaily.com/category/21?search_ids=21',
        '机器学习': 'https://www.arxivdaily.com/category/22?search_ids=22',
    }
    url = urls[choose]
    web = SelenuimLoading()
    extractor = GeneralArxivExtractor()
    try:
        html = web.Spider_Html(url)
        timeput = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        print("{}".format(timeput), sep=" ")
        # Any category name counts as a keyword match for saving.
        extractor.simple(html.replace("\n", ""), urls.keys())
    except Exception as e:
        # Report the failing url and error details without aborting.
        print(url, "出错")
        print(e.args)
    web.close()


if __name__ == '__main__':
    choose = ["自然语言处理", "计算机视觉", "人工智能", "机器学习"]
    receiver_lists = ['1939454633@qq.com', 'htkstudy@163.com', 'htkstudy@163.com']
    flag = True
    # Poll until today's NLP digest appears in html/, then mail it once.
    while True:
        scray("自然语言处理")
        current_time = datetime.datetime.now()
        current_time_string = (str(current_time.year) + "_" +
                               str(current_time.month) + "_" +
                               str(current_time.day))
        for file_name in os.listdir("html"):
            # Match today's date, the NLP category, and the .html suffix.
            if (current_time_string in file_name
                    and "自然语言处理" in file_name
                    and ".html" in file_name):
                seed_email("html/" + file_name, receiver_lists)
                flag = False
                break
        if flag:
            print("未更新")
        else:
            break
        time.sleep(20)
