# 利用者:Bcxfubot/BOT作業依頼/log/20210223-2/prog

# [orig] request.py
# URL張替
# https://fight.abematimes.com/posts/
# ↓
# https://times.abema.tv/fight/news-article/

import re
import time
import pywikibot
import requests
from requests.exceptions import Timeout
from urllib.parse import urlparse

# Regex matching the old URL prefix.  Written as a raw string so the
# backslashes reach the regex engine unchanged instead of forming invalid
# string escape sequences (\/ and \.).
target_re = r"https:\/\/fight\.abematimes\.com\/posts\/"
# Literal old prefix, and the new prefix that is substituted for it.
replace_mae = "https://fight.abematimes.com/posts/"
replace_ato = "https://times.abema.tv/fight/news-article/"
# Upper bound on the number of pages handled in one run.
# NOTE: this shadows the builtin max(); the name is kept because sub() reads it.
max = 120
# Seconds to sleep between two pages.
sleepsec = 60

######################################################
# Processing mode: with 1 the modified page is saved; with any other value
# (e.g. 0) replace_page() does everything except the save.
procmode = 1
######################################################

def get_domain(target):
    """Return the network location of the first URL found in *target*.

    The URL is the first run starting with "http" and extending up to the
    next ASCII or full-width space.  When *target* contains no such run,
    *target* itself is returned unchanged.
    """
    found = re.search("(http[^ 　]+)", target)
    if found is None:
        return target
    # netloc is host[:port]; it is empty when the match has no "//" part.
    return urlparse(found.group(1)).netloc

def get_final_url(url):
    """Request *url* and return the URL the request finally landed on.

    URLs containing ".pdf" are probed with HEAD (PDF bodies are heavy);
    everything else is fetched with GET.  Both requests send a browser
    User-Agent and use ``timeout=5.0``.

    Returns:
        ``response.url`` when the final status code is 200, otherwise "".

    Raises:
        requests.exceptions.Timeout: the request timed out.
        Exception: any other error raised by the request.  Both kinds are
            printed and then re-raised, so a failure stops the caller
            instead of being reported as "".
    """
    ua = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0'
    headers = {'User-Agent': ua}
    try:
        if ".pdf" in url:
            # requests does not follow redirects for HEAD unless asked to;
            # without this the final redirect target could never be returned.
            response = requests.head(url, headers=headers, timeout=5.0,
                                     allow_redirects=True)
        else:
            response = requests.get(url, headers=headers, timeout=5.0)
    except Timeout:
        print("ERROR: timeout")
        raise
    except Exception as e:
        print("ERROR: Exception")
        print(e)
        raise
    print(response.status_code)
    print(response.url)
    if response.status_code != 200:
        return ""
    return response.url

def check_200_https(url):
    """Return the verified replacement URL for *url*, or "" if unusable.

    The candidate is *url* with every occurrence of ``replace_mae`` swapped
    for ``replace_ato``.  It is accepted only when ``get_final_url`` returns
    a non-empty URL that is exactly the candidate (no redirect elsewhere)
    and that differs from *url*.
    """
    print("url=" + url)
    candidate = url.replace(replace_mae, replace_ato)
    print("httpsurl=" + candidate)
    resolved = get_final_url(candidate)
    print("finalurl=" + resolved)
    accepted = resolved != "" and resolved == candidate and url != resolved
    return resolved if accepted else ""


def replace_page(pagetitle):
    """Rewrite old-prefix links on the wiki page *pagetitle* and save it.

    The page text is processed line by line.  Lines mentioning an archive
    service (web.archive.org, Archive.today, archiveurl, Wayback) are copied
    unchanged.  On every other line that matches ``target_re``, each https
    URL that begins with the old prefix is handed to ``check_200_https``;
    when that returns a different, non-empty URL, every occurrence of the
    old URL on the line is replaced by it.

    When at least one line changed, the overall length change is
    sanity-checked, the new text is assigned to the page, and the page is
    saved only if ``procmode == 1``.

    Raises:
        Exception: the rewritten text is more than 10 characters shorter or
            more than 50 characters longer than the original page text.
    """
    site = pywikibot.Site()
    page = pywikibot.Page(site, pagetitle)

    # Anchor the prefix pattern so that only URLs *starting* with it qualify.
    # The result does not depend on the line, so it is built once.
    if target_re.startswith("^"):
        anchored_re = target_re
    else:
        anchored_re = "^" + target_re
    # A URL runs until whitespace (ASCII or full-width) or a wiki/HTML delimiter.
    url_pattern = r"https://[^ 　\t\|\]\}<>\)]+"
    archive_markers = ("web.archive.org", "Archive.today", "archiveurl", "Wayback")

    modified = False
    outlines = []
    for line in page.text.split('\n'):
        # Never touch lines that reference an archived copy.
        if any(re.search(marker, line) for marker in archive_markers):
            outlines.append(line)
            continue
        if re.search(target_re, line):
            for url in re.findall(url_pattern, line):
                # A fragment of an HTML comment can end up glued to the URL;
                # skip such matches.
                if "--" in url:
                    continue
                print("tmp_re=" + anchored_re)
                if not re.search(anchored_re, url):
                    continue
                finalurl = check_200_https(url)
                if finalurl != "" and finalurl != url:
                    line = line.replace(url, finalurl)
                    # The leading 0 is a vestigial section flag, kept so the
                    # log format stays the same.
                    print(0, line)
                    modified = True
        outlines.append(line)

    if not modified:
        return

    # Every line, including the last one, is terminated with a newline, so
    # outtext is one character longer than a plain re-join; difflen below
    # includes that extra character.
    outtext = "\n".join(outlines) + "\n"

    # Sanity check: only URL prefixes were swapped, so the length should
    # change by a small amount.  Anything larger means something went wrong.
    difflen = len(outtext) - len(page.text)
    print("difflen=" + str(difflen))
    if difflen < -10 or difflen > 50:
        raise Exception(
            "replace_page: length change " + str(difflen)
            + " is outside the expected range [-10, 50]; page not saved"
        )
    page.text = outtext
    if procmode == 1:
        page.save("[[Wikipedia:Bot作業依頼#「Abema_格闘TIMES」のリンク切れを修正]] ([[Wikipedia:Bot|Bot]]による編集)")

def get_pagetitle():
    """Return the next unprocessed page title from the file "list".

    Each line of the file holds one title; a line ending in ",sumi" has
    already been processed.  The first line without that suffix is returned
    (trailing newline removed).  Returns "" when every line is marked.
    """
    with open("list") as listfile:
        titles = (raw.rstrip("\n") for raw in listfile)
        pending = (title for title in titles if not title.endswith(",sumi"))
        return next(pending, "")

def done_pagetitle(pagetitle):
    """Mark *pagetitle* as processed in the file "list".

    Every line that equals *pagetitle* exactly gets ",sumi" appended; all
    other lines are kept.  The file is then rewritten with each line
    terminated by a newline.  Always returns "".
    """
    listpath = "list"
    with open(listpath) as src:
        entries = [raw.rstrip("\n") for raw in src]
    marked = [
        entry + ",sumi" if entry == pagetitle else entry
        for entry in entries
    ]
    with open(listpath, mode='w') as dst:
        dst.write("".join(entry + "\n" for entry in marked))
    return ""

def sub():
    """Process up to ``max`` pending pages, pausing between them.

    Each round fetches the next title with get_pagetitle(), stops when it
    is "", otherwise rewrites the page and marks the title as done.  After
    every round except the last possible one, sleeps ``sleepsec`` seconds.
    """
    last_index = max - 1
    for index in range(max):
        pagetitle = get_pagetitle()
        progress = "[" + str(index + 1) + "/" + str(max) + "]"
        print(progress + ":" + "pagetitle=" + pagetitle)
        if pagetitle == "":
            break
        replace_page(pagetitle)
        done_pagetitle(pagetitle)

        # No pause is needed once the run limit has been reached.
        if index < last_index:
            print("sleep(" + str(sleepsec) + ")")
            time.sleep(sleepsec)

def main():
    """Script entry point: run the page-processing loop, then print "done."."""
    sub()
    print("done.")


# Run only when executed as a script, not when imported.
if __name__ == "__main__":
    main()