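# Source wiki page: 利用者:Bcxfubot/BOT作業依頼/log/20210223-2/prog
# 表示  (wiki page-view label captured together with the paste)
# [orig] request.py
# URL replacement: rewrite links that start with
#   https://fight.abematimes.com/posts/
# so that they start with
#   https://times.abema.tv/fight/news-article/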
import re
import time
import pywikibot
import requests
from requests.exceptions import Timeout
from urllib.parse import urlparse
# Regex that recognises the old URL prefix. Written as a raw string so the
# backslashes reach the regex engine without relying on Python passing
# unknown escape sequences through (which emits a warning on current
# interpreters). The pattern text is unchanged.
target_re = r"https:\/\/fight\.abematimes\.com\/posts\/"
# Literal prefix to replace ("mae" = before) and its replacement ("ato" = after).
replace_mae = "https://fight.abematimes.com/posts/"
replace_ato = "https://times.abema.tv/fight/news-article/"
# Maximum number of pages handled in one run (use a small value such as 10
# for a trial run). NOTE: this name shadows the builtin max(); it is kept
# because sub() reads it under this name.
max = 120
# Seconds to wait between pages.
sleepsec = 60
######################################################
# Processing mode: replace_page() saves the page only when this is 1;
# any other value (e.g. 0) computes the replacement without saving.
procmode = 1
######################################################
def get_domain(target):
    """Return the host part of the first URL found in *target*.

    The first run of non-space characters starting with "http" is treated
    as the URL. When *target* contains no such run, *target* itself is
    returned unchanged.
    """
    found = re.search("(http[^ ]+)", target)
    if not found:
        return target
    return urlparse(found.group(1)).netloc
def get_final_url(url):
    """Return the final URL reached after following redirects from *url*.

    Returns:
        The final response URL when its status code is 200, otherwise "".

    Raises:
        requests.exceptions.Timeout: the request did not complete within
            5 seconds (after printing "ERROR: timeout").
        Exception: any other error raised by the request (after printing
            it). Errors are deliberately propagated rather than turned
            into "", so a network failure stops the run.
    """
    ua = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0'
    headers = {'User-Agent': ua}
    try:
        if ".pdf" in url:
            # PDFs are heavy, so only request the headers (2020.1.13).
            # requests.head() does not follow redirects unless asked to,
            # so enable it explicitly to really reach the final URL.
            response = requests.head(
                url, headers=headers, timeout=5.0, allow_redirects=True
            )
        else:
            response = requests.get(url, headers=headers, timeout=5.0)
    except Timeout:
        print("ERROR: timeout")
        raise
    except Exception as e:
        print("ERROR: Exception")
        print(e)
        raise
    print(response.status_code)
    print(response.url)
    if response.status_code != 200:
        return ""
    return response.url
def check_200_https(url):
    """Return the verified replacement for *url*, or "" if it cannot be used.

    The old prefix (replace_mae) in *url* is swapped for the new one
    (replace_ato) and the result is fetched with get_final_url(). The
    rewritten URL is returned only when the fetch yields a non-empty final
    URL that is exactly the rewritten URL (no further redirect) and that
    differs from the original *url*.
    """
    print("url=" + url)
    candidate = url.replace(replace_mae, replace_ato)
    print("httpsurl=" + candidate)
    resolved = get_final_url(candidate)
    print("finalurl=" + resolved)
    accepted = resolved != "" and resolved == candidate and url != resolved
    if accepted:
        return resolved
    return ""
# Lines that mention an archive service are never modified. The dots are
# intentionally left unescaped so the set of skipped lines is the same as
# with the four separate searches this pattern replaces.
_ARCHIVE_LINE_RE = re.compile("web.archive.org|Archive.today|archiveurl|Wayback")
# An https URL, ending at whitespace or at a wiki/markup delimiter.
_HTTPS_URL_RE = re.compile(r"https://[^ \t\|\]\}<>\)]+")


def _rewrite_line(line, anchored_re, checked):
    """Return *line* with every verified target URL replaced.

    Each URL match is substituted individually, so a URL that merely occurs
    inside a longer URL on the same line is never rewritten by accident.
    *checked* caches check_200_https() results so a URL that appears several
    times is fetched only once.
    """
    def substitute(match):
        url = match.group(0)
        # A fragment of an HTML comment can end up in the match; skip it
        # (2020.8.8).
        if "--" in url:
            return url
        print("tmp_re=" + anchored_re)
        if not re.search(anchored_re, url):
            return url
        if url not in checked:
            checked[url] = check_200_https(url)
        finalurl = checked[url]
        if finalurl != "" and finalurl != url:
            return finalurl
        return url

    return _HTTPS_URL_RE.sub(substitute, line)


def replace_page(pagetitle):
    """Rewrite the target URLs on the wiki page *pagetitle* and save it.

    Lines that mention an archive service are left alone. On every other
    line matching target_re, each https URL that starts with the target
    prefix is replaced by the URL returned from check_200_https(), when
    that is non-empty. The page is saved only when something changed and
    procmode is 1.

    Raises:
        RuntimeError: the new text differs in length from the old one by
            less than -10 or more than 50 characters, which is treated as
            a sign that the rewrite went wrong.
    """
    site = pywikibot.Site()
    page = pywikibot.Page(site, pagetitle)
    original = page.text

    # The URL itself must start with the target prefix, so anchor the pattern.
    anchored_re = target_re if target_re[0] == "^" else "^" + target_re

    checked = {}
    outlines = []
    for line in original.split('\n'):
        if not _ARCHIVE_LINE_RE.search(line) and re.search(target_re, line):
            rewritten = _rewrite_line(line, anchored_re, checked)
            if rewritten != line:
                print(rewritten)
            line = rewritten
        outlines.append(line)

    # Joining (instead of appending "\n" to every line) reproduces the
    # original text exactly when nothing was replaced, so the length check
    # below measures only the URL changes.
    outtext = "\n".join(outlines)
    if outtext == original:
        return

    # Sanity check: only URL prefixes were swapped, so the length should
    # change by a small amount. Anything else means something is off.
    difflen = len(outtext) - len(original)
    print("difflen=" + str(difflen))
    if difflen < -10 or difflen > 50:
        raise RuntimeError(
            "unexpected length change of " + str(difflen)
            + " characters on page: " + pagetitle
        )

    page.text = outtext
    if procmode == 1:
        page.save("[[Wikipedia:Bot作業依頼#「Abema_格闘TIMES」のリンク切れを修正]] ([[Wikipedia:Bot|Bot]]による編集)")
def get_pagetitle(path="list"):
    """Return the first page title in the list file that is not yet done.

    The file holds one title per line; a finished title carries the suffix
    ",sumi". Empty lines are skipped, because returning one would look like
    the "nothing left" result and stop the caller early.

    Args:
        path: list file to read. Defaults to "list" in the current
            directory. The file is opened with the platform default
            encoding.

    Returns:
        The first unfinished title, or "" when there is none.
    """
    with open(path) as f:
        for s_line in f:
            title = s_line.rstrip("\n")
            if not title:
                continue
            if not title.endswith(",sumi"):
                return title
    return ""
def done_pagetitle(pagetitle, path="list"):
    """Mark *pagetitle* as processed in the list file.

    Every line that equals *pagetitle* exactly gets ",sumi" appended; all
    other lines are written back unchanged. The rewritten file always ends
    each line, including the last, with a newline.

    Args:
        pagetitle: the title to mark.
        path: list file to update. Defaults to "list" in the current
            directory. The file is read and written with the platform
            default encoding.

    Returns:
        Always "".
    """
    with open(path) as f:
        lines = [s_line.rstrip("\n") for s_line in f]
    marked = [
        line + ",sumi" if line == pagetitle else line
        for line in lines
    ]
    with open(path, mode='w') as f:
        f.write("".join(line + "\n" for line in marked))
    return ""
def sub():
    """Process up to `max` pages from the list, pausing between pages.

    Each round takes the next unfinished title from get_pagetitle(),
    rewrites that page with replace_page() and marks it done with
    done_pagetitle(). The loop stops early when no title is left.

    The pause of `sleepsec` seconds is taken before every page except the
    first, so consecutive edits are still spaced out but no time is spent
    sleeping after the last page once the list is exhausted.

    NOTE: `max` here is the module-level page limit, not the builtin.
    """
    for i in range(max):
        pagetitle = get_pagetitle()
        print("[" + str(i + 1) + "/" + str(max) + "]" + ":" + "pagetitle=" + pagetitle)
        if pagetitle == "":
            break
        if i > 0:
            print("sleep(" + str(sleepsec) + ")")
            time.sleep(sleepsec)
        replace_page(pagetitle)
        done_pagetitle(pagetitle)
def main():
    """Run the batch via sub() and print "done." when it returns."""
    sub()
    print("done.")


# Only start the batch when executed as a script, not when imported.
if __name__ == "__main__":
    main()