コンテンツにスキップ

英文维基 | 中文维基 | 日文维基 | 草榴社区

利用者:Bcxfubot/BOT作業依頼/log/20240517-4/prog

#!/usr/bin/python
# pylint: disable=consider-using-f-string
"""
[orig] honbun_archive.py
URL張替
[http://www.blog.com/blog.html ページタイトル]

[https://web.archive.org/20210503125012/http://www.blog.com/blog.html ページタイトル]


一行に複数リンクがある場合に、最後のやつだけしか置換されない問題あり
→ 後日再度本スクリプトを動かして、最後以外のも置換させる必要あり。2020.2.13
"""

import re
from urllib.parse import urlparse
import time
import urllib.parse
import pywikibot
import requests
from requests.exceptions import Timeout

TARGET_RE = r"https:\/\/www\.toei\.co\.jp\/tv\/"
MAXCOUNT = 120
SLEEPSEC = 60

######################################################
# 処理モード
#PROCMODE = 0
PROCMODE = 1
######################################################


def get_domain(target):
    """Return the domain (netloc) of the first URL found in *target*.

    If *target* contains no http(s) URL at all, the original string is
    returned unchanged so the caller can still use it as a label.
    """
    url = ""
    # Grab the first URL; the character class stops at an ASCII or
    # ideographic space (both occur in wiki markup).
    result = re.search("(http[^  ]+)", target)
    if result:
        url = result.group(1)
    else:
        return target
    # urlparse gives us the netloc directly; no format() round-trip needed.
    return urlparse(url).netloc

def get_date_core(origurl):
    """Query the Wayback Machine availability API for *origurl*.

    Returns the timestamp of the snapshot closest to 2101-01-01
    (i.e. effectively the newest one), or "" when no snapshot exists
    or the gateway times out.
    """
    encoded_url = urllib.parse.quote(origurl, safe="")
    print(f"encoded_url = {encoded_url}")
    api_url = "https://archive.org/wayback/available?url=" + encoded_url + "&timestamp=21010101"
    print(f"api_url = {api_url}")

    # A browser-like UA; archive.org throttles obvious bots harder.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/113.0',
    }
    try:
        response = requests.get(api_url, timeout=60.0, headers=headers)
    except Timeout:
        print("ERROR: timeout")
        raise
    except Exception as err:
        print("ERROR: Exception")
        print(err)
        raise
    print(f"response.text = {response.text}")
    # The API sometimes answers with an HTML error page instead of JSON.
    if "504 Gateway Time-out" in response.text:
        return ""

    data = response.json()
    print(data)
    try:
        date = data["archived_snapshots"]["closest"]["timestamp"]
    except (TypeError, KeyError):
        # No snapshot recorded (empty dict or null payload).
        date = ""
    print(date)
    return date

def get_date(origurl):
    """Fetch the archive timestamp for *origurl*, retrying once.

    A short sleep separates the attempts; "" means no snapshot found.
    """
    timestamp = ""
    for attempt in range(2):
        timestamp = get_date_core(origurl)
        if timestamp != "":
            break
        print("sleep(10), i=" + str(attempt))
        time.sleep(10)
    return timestamp


def get_stat(url):
    """Return the HTTP status code for *url*.

    PDFs are probed with HEAD to avoid pulling the whole document;
    anything else uses GET so redirect chains resolve.  Connection
    failures are reported as 404; timeouts and other errors re-raise.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/113.0',
    }
    fetch = requests.head if ".pdf" in url else requests.get
    try:
        response = fetch(url, headers=headers, timeout=5.0)
    except requests.exceptions.ConnectionError:
        print("ERROR: ConnectionError")
        return 404
    except Timeout:
        print("ERROR: timeout")
        raise
    except Exception as err:
        print("ERROR: Exception")
        print(err)
        raise
    print(response.status_code)
    print(response.url)
    return response.status_code

def is_domain_webarchive(origurl, origline):
    """Return True when *origline* already contains a web.archive.org
    snapshot link for the same domain as *origurl*."""
    match = re.search(r"https?:\/\/([0-9\.a-zA-Z_-]+)\/", origurl)
    domain = match.group(1) if match else ""

    print("domain=" + domain)
    regexp = r"https?:\/\/" + domain
    print("regexp=" + regexp)
    pattern = r"https:\/\/web\.archive\.org\/web\/[0-9a-z_]+\/" + regexp
    print("pattern=" + pattern)
    return bool(re.search(pattern, origline))



def make_newline(origline):
    """Return *origline* with each matching dead URL rewritten to its
    web.archive.org snapshot.

    Every URL on the line is inspected; already-archived links, links
    whose domain is archived elsewhere on the same line, and links not
    matching TARGET_RE are skipped.  Raises Exception if a rewrite
    would produce a doubled web.archive.org prefix.
    """
    print("origline=" + origline)
    # Collect every URL on the line.  The character class stops at
    # ASCII/ideographic spaces, tab, '|', ']', '}', '<' and ')'.
    pattern = r"https?://[^  \t\|\]\}<\)]+"
    matchedlist = re.findall(pattern, origline)
    newline = origline
    if matchedlist:
        for url in matchedlist:
            # Trim a trailing HTML-comment closer the scan may pick up.
            result = re.search(r"(http.*[^\-])-->$", url)
            if result:
                url = result.group(1)

            origurl = url
            print("origurl = " + origurl)
            # Skip links that already point at the archive.
            result = re.search("web.archive.org", origurl)
            if result:
                print( "ERR: This is archive.org. pass")
                continue
            if is_domain_webarchive(origurl, origline):
                print( "ERR: is_domain_webarchive() is True. pass")
                continue

            # Only rewrite URLs matching the configured target pattern.
            result = re.search(TARGET_RE, origurl)
            if not result:
                continue

            date = get_date(origurl)
            if date == "":
                # No snapshot available; leave this URL alone.
                continue
            print("date = " + date)
            ardate = date

            newurl = "https://web.archive.org/web/" + ardate + "/" + origurl
            # Replace only the FIRST occurrence so that in
            # {{Cite web|url=AAA|archiveurl=AAA}} the archiveurl copy
            # stays untouched (2020.11.19).
            newline = newline.replace(origurl, newurl, 1)
            print("newline = " + newline)
            # Guard: never emit a doubled web.archive.org prefix (2020.7.8).
            result = re.search(r"https:\/\/web\.archive\.org\/(web\/)?[0-9]+\/"\
                r"https:\/\/web\.archive\.org\/", newline)
            if result:
                print("ERROR: web.archive.org 二重書き")
                raise Exception
    return newline

def mk_comment():
    """Assemble the edit summary used when saving a page."""
    if "http:" in TARGET_RE:
        scheme_note = "http:// -> web.archive.org"
    else:
        scheme_note = "https:// -> web.archive.org"

    # Strip the regex backslashes so get_domain sees a plain URL.
    domain_note = "(" + get_domain(TARGET_RE.replace("\\", "")) + ")"
    request_note = "[[Wikipedia:Bot作業依頼#東映のリンク切れをウェイバックマシンに置換]] ([[Wikipedia:Bot|Bot]]による編集)"

    return "外部リンクの修正 " + scheme_note + " " + domain_note + " " + request_note

def replace_page(site,pagetitle):
    """Rewrite target URLs on wiki page *pagetitle* and save the result.

    Returns True when an edit was actually saved, False otherwise
    (no change needed, or PROCMODE is 0).  Raises Exception when the
    size delta looks implausible.
    """
    is_saved = False
    page = pywikibot.Page(site, pagetitle)
    #text = page.text
    #print(text)
    linelist = page.text.split('\n')
    #print(linelist)

    comment = ""
    gaibu = 0
    modflag = 0
    outtext = ""
    for line in linelist:
        # Lines that already mention an archiving service are prone to
        # double rewriting, so pass them through untouched (2020.11.19).
        if ( re.search(r"[Ww]ayback",line) or
            re.search(r"[Aa]rchive\.(is|ph|li|fo|vn|md|today)", line) or
            re.search(r"[wW]eb[aA]rchive", line) ):
            outtext += line + "\n"
            continue
        if re.search(TARGET_RE,line):
            newline = make_newline( line )
            if newline != "":
                if line != newline:
                    line = newline
                    # NOTE(review): this value is overwritten by
                    # mk_comment() before saving; kept for debugging.
                    comment = newline
                    print(gaibu,line)
                    modflag = 1
        outtext += line + "\n"

    # Preserve the page's original trailing-newline state: the join
    # above always appends "\n", so drop it if the page lacked one.
    if page.text[-1:] != "\n":
        if outtext[-1:] == "\n":
            outtext = outtext[:-1]

    # Sanity check: abort when the size change is implausibly large
    # (shrinking) or beyond what adding archive prefixes could produce.
    difflen = len(outtext) - len(page.text)
    print("difflen=" + str(difflen))
    if ( ( difflen < -30 ) or
         ( difflen > 2600 ) ):
        raise Exception

    if modflag == 1:
        page.text = outtext
        if PROCMODE == 1:
            comment = mk_comment()
            page.save(comment)
            is_saved = True
    return is_saved

# Return one page title still awaiting processing.
# Returns "" when nothing is left to do.
def get_pagetitle():
    """Return the first line of the "list" file not yet marked ",sumi"."""
    with open("list", encoding="utf-8") as file:
        for raw in file:
            title = raw.rstrip("\n")
            if not title.endswith(",sumi"):
                return title
    return ""

# Mark a processed page by appending ",sumi" to its line.
def done_pagetitle(pagetitle):
    """Append ",sumi" to *pagetitle*'s line in the "list" file."""
    path = "list"
    lines = []
    with open(path, encoding="utf-8") as file:
        for raw in file:
            entry = raw.rstrip("\n")
            if entry == pagetitle:
                entry = entry + ",sumi"
            lines.append(entry + "\n")
    with open(path, mode='w', encoding="utf-8") as file:
        file.write("".join(lines))
    return ""

def sub():
    """Process up to MAXCOUNT pending pages from the work list.

    Sleeps SLEEPSEC seconds after a real save and 5 seconds otherwise,
    to stay gentle on the wiki API.
    """
    site = pywikibot.Site()
    site.login()
    for i in range(MAXCOUNT):
        pagetitle = get_pagetitle()
        print("[" + str(i + 1) + "/" + str(MAXCOUNT) + "]" + ":" + "pagetitle=" + pagetitle)
        if pagetitle == "":
            break
        is_saved = replace_page(site, pagetitle)
        done_pagetitle(pagetitle)

        print("is_saved=" + str(is_saved))
        wait = SLEEPSEC if is_saved else 5
        print("sleep(" + str(wait) + ")")
        time.sleep(wait)

def main():
    """Entry point: run the batch job, then report completion."""
    sub()
    print("done.")

# Script entry point (not executed on import).
if __name__ == '__main__':
    main()