Overview
As preparation for NIPS 2017, which starts tomorrow, I wrote a Python script that downloads all the papers in one go, collects each paper's title, abstract, page numbers, and other metadata, and runs the titles and abstracts through Google's machine translation. The code is rough, but feel free to use it if you're interested. (If you only need the summary CSVs, you can grab them below.)
summary.csv (308 downloads)
summary_ja.csv (303 downloads)
Script
I put this together in a rush, so it's rough around the edges... but it does run (Python 3.6.1).
Before running it, create a folder named pdf (lowercase, as the code expects) in the same directory as the script; the PDFs will be downloaded into it.
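If you would rather not create the folder by hand, a couple of lines like the following near the top of the script would do the same thing (just a small standard-library sketch; it is not part of the original script):

import os

# Make sure the download folder exists before any PDF is fetched
os.makedirs("pdf", exist_ok=True)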
import pandas as pd
import requests
from bs4 import BeautifulSoup
from joblib import Parallel, delayed
#import urllib.parse
import re
def getpaperinfo(a, url1, chunk_size=2000):
    # Only follow links that point at individual paper pages
    href = a.get("href")
    if href is None or "paper" not in href:
        return None

    tsession = requests.session()
    tres = tsession.get(url1 + href)
    tres.encoding = 'utf-8'
    tsoup = BeautifulSoup(tres.text, "html.parser")

    # The abstract lives in the page body; everything else comes from <meta> citation tags
    try:
        abstract = tsoup.find(attrs={"class": "abstract"}).text
    except Exception:
        abstract = "No information"
    try:
        title = tsoup.find(attrs={"name": "citation_title"}).get("content")
    except Exception:
        title = "No information"
    try:
        authors = [t.get("content") for t in tsoup.find_all(attrs={"name": "citation_author"})]
    except Exception:
        authors = "No information"
    try:
        date = tsoup.find(attrs={"name": "citation_publication_date"}).get("content")
    except Exception:
        date = "No information"
    try:
        conference = tsoup.find(attrs={"name": "citation_conference_title"}).get("content")
    except Exception:
        conference = "No information"
    try:
        firstpage = tsoup.find(attrs={"name": "citation_firstpage"}).get("content")
    except Exception:
        firstpage = "No information"
    try:
        lastpage = tsoup.find(attrs={"name": "citation_lastpage"}).get("content")
    except Exception:
        lastpage = "No information"
    try:
        description = tsoup.find(attrs={"name": "description"}).get("content")
    except Exception:
        description = "No information"
    try:
        # Stream the PDF into the pdf/ folder, keeping its original file name
        pdfurl = tsoup.find(attrs={"name": "citation_pdf_url"}).get("content")
        r = requests.get(pdfurl, stream=True)
        with open('pdf/' + pdfurl.split("/")[-1], 'wb') as fd:
            for chunk in r.iter_content(chunk_size):
                fd.write(chunk)
    except Exception:
        pdfurl = "No information"

    # One column per paper; the columns are transposed into rows in __main__
    df = pd.DataFrame(data=[title, abstract, authors, date, conference,
                            firstpage, lastpage, description, pdfurl])
    return df
if __name__ == "__main__":
    url1 = "http://papers.nips.cc/"
    url2 = "book/advances-in-neural-information-processing-systems-30-2017"
    chunk_size = 2000

    # Fetch the NIPS 2017 proceedings index page
    session = requests.session()
    res = session.get(url1 + url2)
    res.encoding = 'utf-8'
    # sanity check
    # print(res.text[:1000])
    soup = BeautifulSoup(res.text, "html.parser")
    # sanity check
    # print(soup.text[:100])

    # Visit every link on the index page and collect paper metadata
    # (n_jobs=1 runs sequentially; raise it to parallelize the requests)
    alist = soup.findAll("a")
    dfs = Parallel(n_jobs=1, verbose=10)(
        [delayed(getpaperinfo)(a, url1, chunk_size) for a in alist])

    # Non-paper links returned None and are dropped by pd.concat
    df = pd.concat(dfs, axis=1).T.reset_index(drop=True)
    df.columns = ["title", "abstract", "authors", "date", "conference",
                  "firstpage", "lastpage", "description", "pdfurl"]
    df.to_csv("summary.csv")
    df.to_pickle("summary.pkl")
    # Translate each title and abstract into Japanese by scraping Google Translate:
    # the translated string is embedded in the response as TRANSLATED_TEXT='...'
    df_ja = df[["title", "abstract"]].copy()
    url = 'https://translate.google.com/?hl=ja#en/ja/'
    pattern = "TRANSLATED_TEXT='(.*?)'"
    for i in df_ja.index:
        tmp = df.loc[i]
        etitle = tmp["title"]
        r = requests.get(url, params={'q': etitle})
        df_ja.loc[i, "title"] = re.search(pattern, r.text).group(1)
        eabst = tmp["abstract"]
        r = requests.get(url, params={'q': eabst})
        df_ja.loc[i, "abstract"] = re.search(pattern, r.text).group(1)

    df_ja.to_csv("summary_ja.csv")
    df_ja.to_pickle("summary_ja.pkl")
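Once the script has run, the output can be inspected directly with pandas. For example (a quick sketch, assuming summary.csv and summary_ja.csv were written to the current directory as above):

import pandas as pd

# Load the English summary; the first CSV column is the saved index
df = pd.read_csv("summary.csv", index_col=0)
print(df[["title", "firstpage", "lastpage"]].head())

# The Japanese version shares the same index, so rows line up one-to-one
df_ja = pd.read_csv("summary_ja.csv", index_col=0)
print(df_ja["title"].head())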