Overview
In preparation for NIPS 2017, which starts tomorrow, I wrote a Python script that downloads all the papers in one go, collects each paper's title, abstract, page numbers, and other metadata in bulk, and then runs the titles and abstracts through Google's machine translation. The code is rough, but feel free to use it if you are interested. (If you only need the summary CSVs, you can grab them below.)
summary.csv (247 downloads)
summary_ja.csv (242 downloads)
Script
I threw this together in a rush, so it is rough... but it does run (Python 3.6.1).
Before running, create a folder named pdf in the same directory as the script (the code writes to pdf/, so match the lowercase name on case-sensitive filesystems); the PDFs will be downloaded there.
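If you would rather not create the folder by hand, a couple of lines like the following (not part of the original script, just a convenience) create it before the run:

import os

os.makedirs("pdf", exist_ok=True)  # create the download folder if it doesn't already exist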
import re

import pandas as pd
import requests
from bs4 import BeautifulSoup
from joblib import Parallel, delayed


def getpaperinfo(a, url1, chunk_size=2000):
    """Fetch one paper page, collect its metadata, and download the PDF into pdf/."""
    if "paper" in a.get("href"):
        tsession = requests.session()
        tres = tsession.get(url1 + a.get("href"))
        tres.encoding = 'utf-8'
        tsoup = BeautifulSoup(tres.text, "html.parser")

        # Each field falls back to "No information" if the corresponding tag is missing
        try:
            abstract = tsoup.find(attrs={"class": "abstract"}).text
        except Exception:
            abstract = "No information"
        try:
            title = tsoup.find(attrs={"name": "citation_title"}).get("content")
        except Exception:
            title = "No information"
        try:
            authors = [t.get("content")
                       for t in tsoup.find_all(attrs={"name": "citation_author"})]
        except Exception:
            authors = "No information"
        try:
            date = tsoup.find(attrs={"name": "citation_publication_date"}).get("content")
        except Exception:
            date = "No information"
        try:
            conference = tsoup.find(attrs={"name": "citation_conference_title"}).get("content")
        except Exception:
            conference = "No information"
        try:
            firstpage = tsoup.find(attrs={"name": "citation_firstpage"}).get("content")
        except Exception:
            firstpage = "No information"
        try:
            lastpage = tsoup.find(attrs={"name": "citation_lastpage"}).get("content")
        except Exception:
            lastpage = "No information"
        try:
            description = tsoup.find(attrs={"name": "description"}).get("content")
        except Exception:
            description = "No information"

        # Stream the PDF into the pdf/ folder in chunks
        try:
            pdfurl = tsoup.find(attrs={"name": "citation_pdf_url"}).get("content")
            r = requests.get(pdfurl, stream=True)
            with open('pdf/' + pdfurl.split("/")[-1], 'wb') as fd:
                for chunk in r.iter_content(chunk_size):
                    fd.write(chunk)
        except Exception:
            pdfurl = "No information"

        df = pd.DataFrame(data=[title, abstract, authors, date, conference,
                                firstpage, lastpage, description, pdfurl])
        return df
    else:
        return None


if __name__ == "__main__":
    url1 = "http://papers.nips.cc/"
    url2 = "book/advances-in-neural-information-processing-systems-30-2017"
    chunk_size = 2000

    # Fetch the proceedings index page
    session = requests.session()
    res = session.get(url1 + url2)
    res.encoding = 'utf-8'
    # print(res.text[:1000])  # inspect the raw HTML
    soup = BeautifulSoup(res.text, "html.parser")
    # print(soup.text[:100])  # inspect the parsed text

    # Process every link on the index page; non-paper links return None,
    # which pd.concat silently drops
    alist = soup.findAll("a")
    dfs = Parallel(n_jobs=1, verbose=10)(
        [delayed(getpaperinfo)(a, url1, chunk_size) for a in alist])
    df = pd.concat(dfs, axis=1).T.reset_index(drop=True)
    df.columns = ["title", "abstract", "authors", "date", "conference",
                  "firstpage", "lastpage", "description", "pdfurl"]
    df.to_csv("summary.csv")
    df.to_pickle("summary.pkl")

    # Translate title and abstract to Japanese by scraping the TRANSLATED_TEXT
    # variable embedded in the Google Translate page (depends on Google's page
    # internals, so it may stop working at any time)
    df_ja = df[["title", "abstract"]].copy()
    url = 'https://translate.google.com/?hl=ja#en/ja/'
    pattern = "TRANSLATED_TEXT='(.*?)'"
    for i in df_ja.index:
        tmp = df.loc[i]
        r = requests.get(url, params={'q': tmp["title"]})
        df_ja.loc[i, "title"] = re.search(pattern, r.text).group(1)
        r = requests.get(url, params={'q': tmp["abstract"]})
        df_ja.loc[i, "abstract"] = re.search(pattern, r.text).group(1)
    df_ja.to_csv("summary_ja.csv")
    df_ja.to_pickle("summary_ja.pkl")
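Once the script finishes, the results are easy to browse with pandas. Here is a minimal sketch (assuming summary.pkl and summary_ja.pkl were written as above and the column names match the script) that loads both files and prints the English and Japanese titles side by side:

import pandas as pd

df = pd.read_pickle("summary.pkl")        # English metadata
df_ja = pd.read_pickle("summary_ja.pkl")  # machine-translated title / abstract

# Show the first few papers with their English and Japanese titles
for i in df.index[:5]:
    print(df.loc[i, "title"])
    print(df_ja.loc[i, "title"])
    print("-" * 40)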