コンテンツにスキップ

英文维基 | 中文维基 | 日文维基 | 草榴社区

利用者:Bcxfubot/BOT作業依頼/log/20210203/prog3

# [orig] honbun_archive.py
# URL張替
# [http://www.blog.com/blog.html ページタイトル]
# ↓
# {{Wayback|url=http://www.blog.com/blog.html |title=ページタイトル}}


# 一行に複数リンクがある場合に、最後のやつだけしか置換されない問題あり
# → 後日再度本スクリプトを動かして、最後以外のも置換させる必要あり。2020.2.13

import re
from urllib.parse import urlparse
import time
import pywikibot
import urllib.parse
import requests

# URL prefix whose links are rewritten to Wayback Machine archive URLs.
target = "http://www.waterblue.co.jp/women/"
# Regex form of `target`. Derived with re.escape() so the two values cannot
# drift apart and no invalid "\/" escape sequences are needed in a literal.
target_re = re.escape(target)
# Timestamp sent to the availability API; the snapshot reported as "closest"
# to it is used. NOTE(review): a far-future value presumably selects the most
# recent snapshot ("20010101" was used previously) -- confirm against the API.
timestamp = "20500101"

# Maximum number of pages handled by one run of sub().
# NOTE: the name shadows the builtin max(); it is kept because sub() reads it.
max = 120
# Seconds sub() waits between two pages.
sleepsec = 60

######################################################
# Processing mode: replace_page() saves the edited page only when this is 1;
# with any other value (e.g. 0) the page is rewritten in memory but not saved.
procmode = 1
######################################################

def get_domain(target):
    """Return the host part of the first URL found in *target*.

    The URL is the first run of non-whitespace characters starting with
    "http". Any Unicode whitespace (space, tab, ideographic space, ...) ends
    the URL, so trailing text never leaks into the parsed host.

    Args:
        target: Text that is expected to contain a URL.

    Returns:
        The network location (host) of the URL, or *target* unchanged when
        it contains no "http..." token.
    """
    match = re.search(r"(http\S+)", target)
    if not match:
        return target
    return urlparse(match.group(1)).netloc

def get_date_core(origurl):
    """Query the Wayback Machine availability API once for *origurl*.

    The request carries the module-level ``timestamp`` and the timestamp of
    the snapshot reported as "closest" is returned.

    Args:
        origurl: The original (unarchived) URL to look up.

    Returns:
        The snapshot timestamp string, or "" when the response is a
        "504 Gateway Time-out" page, is not JSON, or lists no closest
        snapshot.

    Raises:
        requests.exceptions.RequestException: Re-raised after logging when
            the HTTP request itself fails (including timeouts).
    """
    encoded_url = urllib.parse.quote(origurl, safe="")
    print("encoded_url = "+ encoded_url)
    api_url = "https://archive.org/wayback/available?url=" + encoded_url + "&timestamp=" + timestamp
    print("api_url = "+ api_url)

    ua = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:75.0) Gecko/20100101 Firefox/75.0'
    headers = {'User-Agent': ua}
    try:
        response = requests.get(api_url, timeout=60.0, headers=headers)
    except requests.exceptions.Timeout:
        # The original named a bare, undefined `Timeout` here, which turned
        # every request failure into a NameError.
        print("ERROR: timeout")
        raise
    except Exception as e:
        print("ERROR: Exception")
        print(e)
        raise
    print("response.text = " + response.text)
    if "504 Gateway Time-out" in response.text:
        return ""

    try:
        data = response.json()
    except ValueError:
        # A non-JSON body (e.g. an HTML error page) is treated like the 504
        # case: report "no result" so the caller can retry.
        print("ERROR: response is not JSON")
        return ""
    print(data)
    try:
        date = data["archived_snapshots"]["closest"]["timestamp"]
    except (KeyError, TypeError):
        # No snapshot is listed for this URL.
        date = ""
    print(date)
    return date

def get_date(origurl):
    """Look up the archive timestamp for *origurl*, trying up to three times.

    get_date_core() is called until it returns a non-empty timestamp. The
    function waits 30 seconds between attempts, but not after the last one,
    since nothing follows it.

    Args:
        origurl: The original (unarchived) URL to look up.

    Returns:
        The timestamp string, or "" when every attempt came back empty.
    """
    attempts = 3
    result = ""
    for attempt in range(attempts):
        result = get_date_core(origurl)
        if result != "":
            break
        if attempt < attempts - 1:
            print("sleep(30)")
            time.sleep(30)

    return result


def make_newline( origline ):
    """Rewrite the target links in one line of page text to archive URLs.

    Every http(s) URL in *origline* is examined. URLs that already point at
    web.archive.org, or that do not match the module-level ``target_re``,
    are left alone. For the others the archive timestamp is looked up with
    get_date() and the URL is replaced by
    ``https://web.archive.org/web/<timestamp>/<url>``.

    Args:
        origline: One line of page text.

    Returns:
        The line with the replacements applied; identical to *origline*
        when nothing was replaced.

    Raises:
        RuntimeError: If a replacement produced an archive URL nested inside
            another archive URL. The original code only printed a message
            here because its bare ``Exception`` expression was a no-op.
    """
    print("origline="+origline)
    # A URL ends at any whitespace (including the ideographic space) or at
    # one of the wiki-markup delimiters | ] } < ).
    pattern = r"https?://[^\s|\]}<)]+"
    newline = origline
    for origurl in re.findall(pattern, origline):
        print("origurl = " + origurl)
        if "web.archive.org" in origurl:
            print( "ERR: This is archive.org. pass")
            continue
        if not re.search(target_re, origurl):
            continue
        date = get_date(origurl)
        if date == "":
            continue
        print("date = " + date)

        newurl = "https://web.archive.org/web/" + date + "/" + origurl
        # Replace only the first occurrence so that in
        # {{Cite web|url=AAA|archiveurl=AAA}} the archiveurl value is left
        # untouched (2020.11.19).
        newline = newline.replace(origurl, newurl, 1)
        print("newline = " + newline)
        # Guard against a doubly-archived URL (2020.7.8).
        if re.search(r"https://web\.archive\.org/(web/)?[0-9]+/https://web\.archive\.org/", newline):
            print("ERROR: web.archive.org 二重書き")
            raise RuntimeError("nested web.archive.org URL produced: " + newline)
    return newline

def replace_page(pagetitle):
    """Archive the target links on one wiki page and save it if it changed.

    The page text is processed line by line. Lines that already mention
    Wayback/WebArchive/archiveurl, or that contain the target URL more than
    once, are copied unchanged because they are prone to double rewriting
    (2020.11.19). Every other line containing ``target`` goes through
    make_newline().

    The text is reassembled with "\\n".join so untouched pages keep their
    exact content; the original appended "\\n" to every line, which added a
    newline at the end of the text.

    Args:
        pagetitle: Title of the page to process.
    """
    site = pywikibot.Site()
    page = pywikibot.Page(site, pagetitle)

    # Compiled once instead of on every line.
    skip_patterns = [
        re.compile("[Ww]ayback"),
        re.compile("archiveurl"),
        re.compile(target_re + ".*" + target_re),
        re.compile("[wW]eb[aA]rchive"),
    ]

    modified = False
    outlines = []
    for line in page.text.split('\n'):
        if any(skip.search(line) for skip in skip_patterns):
            outlines.append(line)
            continue
        if target in line:
            newline = make_newline( line )
            if newline != "" and newline != line:
                line = newline
                # The leading 0 keeps the log format of the original script.
                print(0, line)
                modified = True
        outlines.append(line)

    if modified:
        page.text = "\n".join(outlines)
        # The edit is only saved in processing mode 1.
        if procmode == 1:
            page.save("外部リンクの修正 http:// -> web.archive.org (" + get_domain(target) + ") ([[Wikipedia:Bot|Bot]]による編集)")

# Return one page title that still has to be processed.
# Return "" when there is nothing left to process.
def get_pagetitle():
    """Return the first line of the "list" file that is not marked done.

    A line counts as done when it ends with ",sumi". The trailing newline
    is stripped from the returned line.

    Returns:
        The first unmarked line, or "" if every line is marked.
    """
    with open("list") as listfile:
        entries = (raw.rstrip("\n") for raw in listfile)
        return next(
            (entry for entry in entries if not entry.endswith(",sumi")),
            "",
        )

# Append ",sumi" to the line that has been processed.
def done_pagetitle(pagetitle):
    """Mark *pagetitle* as done in the "list" file.

    Every line equal to *pagetitle* gets ",sumi" appended. The file is then
    rewritten with each line terminated by a newline.

    Args:
        pagetitle: The exact line content to mark.

    Returns:
        Always "".
    """
    list_path = "list"
    with open(list_path) as src:
        entries = [raw.rstrip("\n") for raw in src]

    marked = [
        entry + ",sumi" if entry == pagetitle else entry
        for entry in entries
    ]

    with open(list_path, mode='w') as dst:
        dst.write("".join(entry + "\n" for entry in marked))
    return ""

def sub():
    """Process pending pages from the list, at most ``max`` per run.

    Each round fetches the next unmarked title, rewrites that page and marks
    the title as done. The loop ends early when no title is left. Between
    rounds (but not after the last possible one) it sleeps ``sleepsec``
    seconds.
    """
    for count in range(1, max + 1):
        pagetitle = get_pagetitle()
        print("[{}/{}]:pagetitle={}".format(count, max, pagetitle))
        if not pagetitle:
            return
        replace_page(pagetitle)
        done_pagetitle(pagetitle)

        if count < max:
            print("sleep({})".format(sleepsec))
            time.sleep(sleepsec)

def main():
    """Script entry point: run the page loop, then report completion."""
    sub()
    print("done.")


# Run only when executed as a script, not when imported.
if __name__ == "__main__":
    main()