# Wiki page: 利用者:Bcxfubot/BOT作業依頼/log/20210323/prog2
# (表示)
# [orig] first_archive.py
# first_archive.pyは、新聞ニュース記事などで使う。最初のアーカイブが重要なやつ。
# [http://www.blog.com/blog.html ページタイトル]
# ↓
# {{Wayback|url=http://www.blog.com/blog.html |title=ページタイトル}}
# 一行に複数リンクがある場合に、最後のやつだけしか置換されない問題あり
# → 後日再度本スクリプトを動かして、最後以外のも置換させる必要あり。2020.2.13
import re
import time
import pywikibot
import urllib.parse
import requests
from urllib.parse import urlparse
from requests.exceptions import Timeout
# Literal URL prefix; used for the cheap substring pre-check on each wiki line.
target = "http://www.ntv.co.jp/kinro/lineup/"
# Regex form of the same prefix. Written as a raw string so the value stays
# byte-identical while "\/" and "\." no longer count as invalid escapes.
target_re = r"http:\/\/www\.ntv\.co\.jp\/kinro\/lineup\/"
# Upper bound on the number of pages handled in one run.
# NOTE: shadows the builtin max(); sub() reads it under this name.
max = 120
# Seconds to wait between two pages.
sleepsec = 60
######################################################
# Processing mode: 1 saves the edited page; any other value (e.g. 0) only
# computes and prints the replacement without saving.
procmode = 1
######################################################
def get_domain(target):
    """Return the network location (host part) of the first URL in *target*.

    The first run of non-space characters starting with ``http`` is treated
    as the URL. When *target* contains no such run, *target* itself is
    returned unchanged.
    """
    match = re.search("(http[^ ]+)", target)
    if not match:
        return target
    return urlparse(match.group(1)).netloc
def get_date_core(origurl):
    """Query the Wayback Machine availability API once for *origurl*.

    The request asks for the snapshot closest to timestamp 20010101.

    Returns:
        The ``timestamp`` string of the ``closest`` archived snapshot, or
        ``""`` when the response body is a "504 Gateway Time-out" page or the
        JSON reply carries no closest snapshot.

    Raises:
        Any exception raised by ``requests.get`` (including ``Timeout``) is
        logged and re-raised. ``response.json()`` may also raise if the body
        is not JSON.
    """
    encoded_url = urllib.parse.quote(origurl, safe="")
    print("encoded_url = " + encoded_url)
    # The parameter separator must be a literal "&timestamp="; an HTML-decoded
    # "&times;" had turned it into "×tamp", which the API does not understand.
    api_url = (
        "https://archive.org/wayback/available?url="
        + encoded_url
        + "&timestamp=20010101"
    )
    print("api_url = " + api_url)
    ua = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:75.0) Gecko/20100101 Firefox/75.0'
    headers = {'User-Agent': ua}
    try:
        response = requests.get(api_url, timeout=60.0, headers=headers)
    except Timeout:
        print("ERROR: timeout")
        raise
    except Exception as e:
        print("ERROR: Exception")
        print(e)
        raise
    print("response.text = " + response.text)
    # A gateway error page is not JSON; report "no result" so the caller can retry.
    if "504 Gateway Time-out" in response.text:
        return ""
    data = response.json()
    print(data)
    try:
        firstdate = data["archived_snapshots"]["closest"]["timestamp"]
    except (KeyError, TypeError):
        # No snapshot information in the reply.
        firstdate = ""
    print(firstdate)
    return firstdate
def get_date(origurl):
    """Look up the snapshot timestamp for *origurl*, retrying on empty results.

    Calls get_date_core() up to three times, waiting 30 seconds between
    attempts, and returns the first non-empty result. Returns ``""`` when all
    attempts come back empty. Exceptions from get_date_core() propagate.
    """
    attempts = 3
    for attempt in range(attempts):
        result = get_date_core(origurl)
        if result != "":
            return result
        # Only wait when another attempt follows; sleeping after the final
        # failure would just delay the caller.
        if attempt < attempts - 1:
            print("sleep(30)")
            time.sleep(30)
    return ""
def make_newline(origline):
    """Rewrite a bracketed external link on *origline* as a {{Wayback}} call.

    A link of the form ``[<target URL...> <text>]`` becomes
    ``{{Wayback|url=<url> |title=<text> |date=<timestamp>}}``; the text before
    and after the link is kept. Because the leading ``(.*)`` is greedy, only
    the last matching link on the line is rewritten.

    Returns:
        The rewritten line; *origline* unchanged when the link text contains
        an internal ``[[...]]`` link (not handled here); ``""`` when the line
        holds no matching link.
    """
    pattern = r"^(.*)\[(" + target_re + r"[^ ]*)[ ]+([^\]]*)\](.*)$"
    result = re.search(pattern, origline)
    if not result:
        print("newline = ")
        return ""
    pre, origurl, origtext, post = result.groups()
    print("pre=" + pre)
    print("origurl = " + origurl)
    print("origtext = " + origtext)
    # Link text containing an internal [[...]] link cannot be processed
    # correctly, so leave such lines alone.
    if "[[" in origtext:
        return origline
    date = get_date(origurl)
    # No timestamp found: fall back to "*" as the date value.
    if date == "":
        date = "*"
    print("date = " + date)
    # A bare "|" inside the title would be read as a template parameter
    # separator, so emit it as the character reference instead.
    origtext = origtext.replace("|", "&#124;")
    newline = (
        pre
        + "{{Wayback|url=" + origurl
        + " |title=" + origtext
        + " |date=" + date
        + "}}"
        + post
    )
    print("newline = " + newline)
    return newline
def replace_page(pagetitle):
    """Replace target links on the wiki page *pagetitle* with {{Wayback}}.

    Only lines that start with ``*`` and contain the ``target`` prefix are
    passed to make_newline(). Lines that contain ``<ref``, ``ref>``,
    ``web.archive.org`` or ``Wayback`` are copied through untouched. When at
    least one line changed, the page text is updated and, if ``procmode`` is
    1, saved with the bot edit summary.
    """
    site = pywikibot.Site()
    page = pywikibot.Page(site, pagetitle)
    gaibu = 0  # always 0 here; kept so the log line format stays the same
    modified = False
    outlines = []
    for line in page.text.split('\n'):
        skip = (
            "<ref" in line
            or "ref>" in line
            or "web.archive.org" in line
            or "Wayback" in line
        )
        if not skip and line.startswith("*") and target in line:
            newline = make_newline(line)
            # "" means no link matched; an equal line means nothing to change.
            if newline != "" and newline != line:
                line = newline
                print(gaibu, line)
                modified = True
        outlines.append(line)
    if modified:
        # Joining keeps the original line structure without adding an extra
        # trailing newline.
        page.text = "\n".join(outlines)
        if procmode == 1:
            page.save("[[Wikipedia:Bot作業依頼#金曜ロードSHOW!のリンク切れをウェイバックマシンに置換]] ([[Wikipedia:Bot|Bot]]による編集)")
# Return one page title that still has to be processed.
# Returns "" when nothing is left to process.
def get_pagetitle():
    """Return the first line of the file ``list`` not ending in ``,sumi``.

    Lines are read in order with the trailing newline stripped. Returns
    ``""`` when every line is already marked (or the file is empty).
    """
    path = "list"
    with open(path) as f:
        for raw_line in f:
            title = raw_line.rstrip("\n")
            if not title.endswith(",sumi"):
                return title
    return ""
# Mark a processed line by appending ",sumi" to it.
def done_pagetitle(pagetitle):
    """Append ``,sumi`` to every line of ``list`` that equals *pagetitle*.

    The whole file is read, rewritten with one trailing newline per line, and
    ``""`` is returned. Lines that do not match exactly are written back
    unchanged.
    """
    path = "list"
    with open(path) as f:
        lines = [raw_line.rstrip("\n") for raw_line in f]
    marked = [
        line + ",sumi" if line == pagetitle else line
        for line in lines
    ]
    with open(path, mode='w') as f:
        f.write("".join(line + "\n" for line in marked))
    return ""
def sub():
    """Process up to ``max`` pending pages, pausing between them.

    Each round fetches the next unmarked title from the list file, rewrites
    that page, and marks the title as done. The loop stops early when no
    unmarked title remains. ``sleepsec`` seconds are slept after every page
    except the one handled in the final permitted round.
    """
    for i in range(max):
        num = i + 1
        pagetitle = get_pagetitle()
        print("[" + str(num) + "/" + str(max) + "]" + ":" + "pagetitle=" + pagetitle)
        if pagetitle == "":
            break
        replace_page(pagetitle)
        done_pagetitle(pagetitle)
        if i < max - 1:
            print("sleep(" + str(sleepsec) + ")")
            time.sleep(sleepsec)
def main():
    """Run the batch job and report completion."""
    sub()
    print("done.")


# Run only when executed as a script, not when imported.
if __name__ == '__main__':
    main()