Web上からデータを引っ張ってくる

Web上からデータを引っ張ってくるコード

HEP Postdoc Rumor Mill の2013年～2021年のデータから、各応募者がどの機関からオファーを受け、どの機関のオファーを accept したかを抽出する。
例として、以下のデータを出力するようにしている（※inputは読み取り完了の合図（load completed）の後に行う）。

input:

Institute A

output:

vs. Institute B: a/t

t = AとBの両方からオファーをもらい、そのどちらかを accept した人の人数
a = tのうち、A機関を選んだ人数

※ selenium（webdriver）、beautifulsoup4、lxml の各パッケージと chromedriver が必要。コード中の chromedriver のパス指定は、各自の環境に合わせて変更して使用する。

（実行例）
(IAS, UC Berkeley, Kavli IPMU 等の機関名を入力して下さい。)

input:

output:

（コード）


from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
from collections import defaultdict

# Ordered (substring, canonical name) pairs. The first substring found in the
# institute string decides the result, so the order of this table matters.
_CANONICAL_INSTITUTES = (
    ("Princeton", "Princeton"),
    ("Stanford", "Stanford"),
    ("Berkeley", "UC Berkeley"),
    ("KITP", "KITP"),
    ("Fermilab", "Fermilab"),
    ("New York U.", "New York U."),
    ("Peking U.", "Peking U."),
    ("Tsinghua U.", "Tsinghua U."),
    ("UC Davis", "UC Davis"),
    ("AEI", "Albert Einstein Institute"),
    ("IPMU", "Kavli IPMU"),
    ("Cambridge", "Cambridge U."),
    ("McGill", "McGill"),
)


def purify(inst,stat):
    """Normalise spelling variants of an institute name.

    Args:
        inst: institute string as it appears in the rumor table.
        stat: status string of the same row; only inspected for the
            "It from Qubit" marker.

    Returns:
        The canonical institute name when a known substring is found,
        otherwise ``inst`` unchanged.
    """
    qubit = "It from Qubit"
    # This marker may appear in either column, and it takes precedence
    # over every institute substring.
    if qubit in inst or qubit in stat:
        return qubit
    for needle, canonical in _CANONICAL_INSTITUTES:
        if needle in inst:
            return canonical
    return inst

# Extracted data: priority[chosen, other] counts the applicants who accepted
# `chosen` while also having an offer from `other`.
priority=defaultdict(int)


def _tally_choices(rows, counts):
    """Fold the rows of one rumor page into ``counts``.

    Args:
        rows: iterable of ``(name, inst, stat)`` string triples, one per
            table row. ``inst`` is normalised with ``purify`` before use.
        counts: mapping updated in place. For every applicant with an
            "Accepted" row and at least one "Declined" row,
            ``counts[accepted_inst, other_inst]`` is incremented for each
            declined institute and for each "Offered" institute that differs
            from the accepted one and is not already in the declined list.
    """
    accepted=dict()
    declined=defaultdict(list)
    offered=defaultdict(list)

    for name, inst, stat in rows:
        inst=purify(inst,stat)
        if "Accepted" in stat:
            accepted[name]=inst
        if "Declined" in stat:
            declined[name].append(inst)
        if "Offered" in stat:
            offered[name].append(inst)

    for name, inst1 in accepted.items():
        # NOTE(review): applicants without any "Declined" row are skipped
        # entirely, even if they have "Offered" rows - confirm this is intended.
        if not declined.get(name): continue
        for inst2 in declined[name]:
            counts[inst1,inst2]+=1
        for inst2 in offered[name]:
            if inst1!=inst2 and inst2 not in declined[name]:
                counts[inst1,inst2]+=1


# Fetch the HTML. The page format differs between 2013-2016 and later years,
# so the two ranges are parsed separately.
option = Options()
option.add_argument('--headless')
# Raw string so the Windows backslashes are not read as (invalid) escape
# sequences; the resulting path is identical to the original literal.
# NOTE(review): newer Selenium releases may expect the driver path via a
# Service object rather than positionally - confirm against the installed version.
driver = webdriver.Chrome(r"C:\ProgramData\Anaconda3\Lib\site-packages\selenium\webdriver\chromedriver.exe", options=option)

try:
    # 2017 onwards: embedded Google Sheets, five cells per row.
    links=[
    "https://docs.google.com/spreadsheets/u/0/d/1tZAdZ_Q44CnGGvCWnPDXVBiYHxF-ti0G5RWJwXBwLtk/htmlembed/sheet?headers=false&gid=0",
    "https://docs.google.com/spreadsheets/u/0/d/11Itq4eZBzNsjlSQGnfYib6Obiw59yToG34cC_eBYXeo/htmlembed/sheet?headers=false&gid=0",
    "https://docs.google.com/spreadsheets/u/0/d/1GV_ebj7-Itxy7Pb19Yja7a1f0J2FXuGYvMkRBhmuSBo/htmlembed/sheet?headers=false&gid=0",
    "https://docs.google.com/spreadsheets/u/0/d/1u4RuIFetw_ZNg0APlXhnbkqqYzjS2xSV9sBx0NxxfPo/htmlembed/sheet?headers=false&gid=0",
    "https://docs.google.com/spreadsheets/u/0/d/1iMsLRnNNHFKmdq7ltrp9BR4jDd-Ots6UYDK2dgyRLZ0/htmlembed/sheet?headers=false&gid=0"
    ]

    for url in links:
        driver.get(url)
        soup = BeautifulSoup(driver.page_source, "lxml")

        rows=[]
        for row in soup.find("tbody"):
            # The cells are name, link, institute, status and time, exactly as
            # shown on the page; adapt the unpacking if other columns are needed.
            name, _link, inst, stat, _time=[td.get_text().strip() for td in row.find_all("td")]
            rows.append((name, inst, stat))
        _tally_choices(rows, priority)

    # Up to 2016: Google Sites list pages, four cells per row.
    links=[
    "https://sites.google.com/site/postdocrumor/2016-rumors",
    "https://sites.google.com/site/postdocrumor/2015-rumors",
    "https://sites.google.com/site/postdocrumor/2014-rumors",
    "https://sites.google.com/site/postdocrumor/2013-rumors"
    ]

    for url in links:
        driver.get(url)
        soup = BeautifulSoup(driver.page_source, "lxml")

        rows=[]
        # [1:] skips the first <tr> of the list table.
        for row in soup.find("table", id="goog-ws-list-table").find_all("tr")[1:]:
            # The cells are name, institute, status and time, exactly as shown
            # on the page.
            name, inst, stat, _time=[td.get_text().strip() for td in row.find_all("td")]
            rows.append((name, inst, stat))
        _tally_choices(rows, priority)
finally:
    # Always shut the headless browser down, even if scraping fails part-way.
    driver.quit()


#######################################################################################

# Turn the pairwise preferences into a directed graph: an edge a -> b means
# more applicants chose a over b than chose b over a. Ties produce no edge.
# The author tried to derive an overall ranking from this graph (e.g. via
# strongly connected components) but it did not work out; ideas are welcome.

code=dict()    # institute name -> node id
decode=dict()  # node id -> institute name
N=0            # number of nodes allocated so far
edges=[]       # edges[i]: ids of institutes that institute i wins against
inv_edges=[]   # inv_edges[i]: ids of institutes that win against institute i
for (a,b),cost in priority.items():
    if cost>priority.get((b,a),0):
        for node_name in (a,b):
            # Membership test, not truthiness: node id 0 is a valid id and
            # must not be treated as "not registered yet".
            if node_name not in code:
                code[node_name]=N; decode[N]=node_name; N+=1
                edges.append([]); inv_edges.append([])
        A,B=code[a],code[b]
        edges[A].append(B)
        inv_edges[B].append(A)


#######################################################################################

# Signal that scraping has finished and input is now accepted
# (the banner previously read "lord" by mistake).
print("*load completed*")

# Interactive part: read an institute name A and print, for every institute B
# with a decided head-to-head record against A, "vs. B: x/(x+y)", where x is
# the number of applicants who chose A over B and y the number who chose B over A.

while True:
    try:
        A=input().rstrip()
    except EOFError:
        # End of input (Ctrl-D / closed pipe): leave the prompt cleanly.
        break
    # Membership test, not truthiness: the institute with node id 0 is valid.
    if A not in code: print("No Comparisons"); continue
    a=code[A]
    # First the institutes A wins against, then the ones A loses to.
    for neighbours in (edges[a], inv_edges[a]):
        res=[]
        for b in neighbours:
            B=decode[b]
            x,y=priority.get((A,B),0),priority.get((B,A),0)
            # x+y > 0 here: an edge exists only when one count strictly
            # exceeds the other.
            res.append((B,x/(x+y),x,y))
        # Highest win ratio first; ties broken by larger sample size.
        res.sort(key=lambda r:(r[1],r[2]+r[3]), reverse=True)
        for inst, p, x, y in res:
            print("vs. {}: {}/{}".format(inst,x,x+y))