コンテンツにスキップ

英文维基 | 中文维基 | 日文维基 | 草榴社区

利用者:Bcxfubot/BOT作業依頼/log/20210323/prog2

# [orig] first_archive.py
# first_archive.pyは、新聞ニュース記事などで使う。最初のアーカイブが重要なやつ。
# [http://www.blog.com/blog.html ページタイトル]
# ↓
# {{Wayback|url=http://www.blog.com/blog.html |title=ページタイトル}}


# 一行に複数リンクがある場合に、最後のやつだけしか置換されない問題あり
# → 後日再度本スクリプトを動かして、最後以外のも置換させる必要あり。2020.2.13

import re
import time
import pywikibot
import urllib.parse
import requests
from urllib.parse import urlparse
from requests.exceptions import Timeout

# URL prefix of the external links this run converts to {{Wayback}}.
target = "http://www.ntv.co.jp/kinro/lineup/"
# The same prefix escaped for use inside a regular expression.  Written as a
# raw string: "\/" and "\." are invalid escape sequences in a normal literal
# and trigger a SyntaxWarning on current Python versions.  The runtime value
# is unchanged.
target_re = r"http:\/\/www\.ntv\.co\.jp\/kinro\/lineup\/"

# Maximum number of pages processed per run (earlier runs used 10).
# NOTE: this shadows the builtin max(); the name is kept because sub() reads it.
max = 120
# Seconds to wait between two pages.
sleepsec = 60

######################################################
# Processing mode: replace_page() saves the page only when this is 1;
# any other value (e.g. 0) builds the new text without saving.
#procmode = 0
procmode = 1
######################################################

def get_domain(target):
    """Return the network location of the first URL found in *target*.

    The URL is the first run of characters that starts with ``http`` and
    extends up to the next character listed in the pattern's excluded set.
    When no such run exists, *target* itself is returned unchanged.
    """
    found = re.search("(http[^　 ]+)", target)
    if found is None:
        # Nothing that looks like a URL: hand the input back as-is.
        return target
    return urlparse(found.group(1)).netloc

def get_date_core(origurl):
    """Query the Wayback Machine availability API once for *origurl*.

    The request passes ``timestamp=20010101`` and the function reads the
    timestamp of the ``closest`` snapshot from the JSON answer.
    NOTE(review): presumably this is meant to select the earliest archive;
    confirm against the availability API documentation.

    Returns:
        The snapshot timestamp taken from
        ``data["archived_snapshots"]["closest"]["timestamp"]``, or ``""``
        when the server answered with a 504 page, the body was not valid
        JSON, or the answer contains no such snapshot entry.  Callers treat
        ``""`` as "nothing found, may retry".

    Raises:
        Any exception raised by ``requests.get`` (including ``Timeout``) is
        logged and re-raised.
    """
    encoded_url = urllib.parse.quote(origurl, safe="")
    print("encoded_url = "+ encoded_url)
    api_url = "https://archive.org/wayback/available?url=" + encoded_url + "&timestamp=20010101"
    print("api_url = "+ api_url)

    ua = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:75.0) Gecko/20100101 Firefox/75.0'
    headers = {'User-Agent': ua}
    try:
        response = requests.get(api_url,timeout=60.0,headers=headers)
    except Timeout:
        # Transport failures abort the run; the caller does not catch them.
        print("ERROR: timeout")
        raise
    except Exception as e:
        print("ERROR: Exception")
        print(e)
        raise
    print("response.text = " + response.text)
    if "504 Gateway Time-out" in response.text:
        return ""

    try:
        data = response.json()
    except ValueError:
        # Other gateway/error pages are not JSON either; report "no result"
        # like the 504 case instead of crashing with a decode error.
        print("ERROR: response is not JSON")
        return ""
    print(data)
    try:
        firstdate = data["archived_snapshots"]["closest"]["timestamp"]
    except (KeyError, TypeError):
        # No snapshot entry in the answer (missing key or unexpected shape).
        firstdate = ""
    print(firstdate)
    return firstdate

def get_date(origurl):
    """Look up the archive timestamp for *origurl*, retrying on empty results.

    Calls get_date_core() up to three times, waiting 30 seconds between
    attempts, and stops at the first non-empty answer.

    Returns:
        The first non-empty value returned by get_date_core(), or ``""``
        when all three attempts came back empty.
    """
    attempts = 3
    result = ""
    for attempt in range(attempts):
        result = get_date_core(origurl)
        if result != "":
            break
        # Wait only when another attempt follows; sleeping after the final
        # failure would just delay the caller for no benefit.
        if attempt < attempts - 1:
            print("sleep(30)")
            time.sleep(30)

    return result


def make_newline( origline ):
    """Rewrite one ``[URL text]`` external link on a line as ``{{Wayback}}``.

    The link must start with the module-level ``target_re`` prefix.  Because
    the leading ``^(.*)`` group is greedy, only the last matching link on
    the line is converted; everything before and after it is kept verbatim.

    Returns:
        * ``""`` when the line contains no matching link,
        * *origline* unchanged when the link text contains ``[[`` (internal
          links inside the label are not handled),
        * otherwise the line with the link replaced by
          ``{{Wayback|url=... |title=... |date=...}}``.  The date comes from
          get_date(); ``*`` is used when no timestamp was found.
    """
    pattern = r"^(.*)\[(" + target_re + r"[^　 ]*)[　 ]+([^\]]*)\](.*)$"
    matched = re.search(pattern, origline)
    if not matched:
        return ""

    pre, origurl, origtext, post = matched.groups()
    print("pre="+pre)
    print("origurl = " + origurl)
    print("origtext = " + origtext)

    # Link text containing an internal [[...]] link cannot be converted
    # reliably, so such lines are returned untouched.
    if "[[" in origtext:
        return origline

    date = get_date( origurl )
    if date == "":
        date = "*"

    print("date = " + date)
    # A literal "|" in the title would be read as a template parameter
    # separator; encode it as an HTML entity so it stays part of the title.
    # (The previous replace("|", "|") was a no-op.)
    origtext = origtext.replace("|", "&#124;")
    newline = pre + "{{Wayback|url=" + origurl + " |title=" + origtext + " |date=" + date + "}}" + post
    print("newline = " + newline)
    return newline

def replace_page(pagetitle):
    """Convert target links on the wiki page *pagetitle* and save the result.

    Only lines that start with ``*`` and contain the module-level ``target``
    prefix are handed to make_newline().  Lines matching any of the skip
    patterns below (references, existing archive links or Wayback templates)
    are copied through unchanged.  The page is saved only if at least one
    line changed and ``procmode`` is 1.
    """
    skip_patterns = ("<ref", "ref>", "web.archive.org", "Wayback")

    site = pywikibot.Site()
    page = pywikibot.Page(site, pagetitle)

    gaibu = 0  # always 0; kept so the log line keeps its original format
    changed = False
    pieces = []
    for line in page.text.split('\n'):
        skipped = any(re.search(pat, line) for pat in skip_patterns)
        if not skipped and line.startswith("*") and target in line:
            candidate = make_newline( line )
            # "" means "no matching link"; an identical line means "left alone".
            if candidate != "" and candidate != line:
                line = candidate
                print(gaibu,line)
                changed = True
        pieces.append(line + "\n")

    if not changed:
        return

    page.text = "".join(pieces)
    if procmode == 1:
        page.save("[[Wikipedia:Bot作業依頼#金曜ロードSHOW!のリンク切れをウェイバックマシンに置換]] ([[Wikipedia:Bot|Bot]]による編集)")

# 処理対象のページ名をひとつ返す
# 処理対象がない場合は""を返す
def get_pagetitle():
    """Return the next unprocessed page title from the file ``list``.

    Each line of the file holds one title; lines ending in ``,sumi`` are
    already done.  Returns the first line without that suffix (trailing
    newline removed), or ``""`` when every line is marked as done.
    """
    with open("list") as listfile:
        rows = (raw.rstrip("\n") for raw in listfile)
        return next((row for row in rows if not row.endswith(",sumi")), "")

# 処理した行にsumiをつける
def done_pagetitle(pagetitle):
    """Mark *pagetitle* as processed in the file ``list``.

    Every line that equals *pagetitle* exactly gets ``,sumi`` appended; all
    other lines are kept.  The file is rewritten with each line terminated
    by a newline.  Always returns ``""``.
    """
    list_path = "list"
    with open(list_path) as src:
        rows = [raw.rstrip("\n") for raw in src]

    marked = [row + ",sumi" if row == pagetitle else row for row in rows]

    with open(list_path, mode='w') as dst:
        dst.write("".join(row + "\n" for row in marked))
    return ""

def sub():
    """Process up to ``max`` pages from the work list, one per iteration.

    Each round fetches the next pending title, converts its links, marks it
    as done, and then waits ``sleepsec`` seconds unless it was the last
    permitted round.  The loop ends early once no pending title is left.
    """
    for num in range(1, max + 1):
        pagetitle = get_pagetitle()
        print(f"[{num}/{max}]:pagetitle={pagetitle}")
        if pagetitle == "":
            break
        replace_page(pagetitle)
        done_pagetitle(pagetitle)

        # No pause after the final permitted round.
        if num < max:
            print("sleep(" + str(sleepsec) + ")")
            time.sleep(sleepsec)

def main():
    """Run the batch via sub() and print a completion message."""
    sub()
    print("done.")

# Start the batch only when this file is executed as a script.
if __name__ == '__main__':
    main()