利用者:Bcxfubot/BOT作業依頼/log/20201122/nagoya12/prog
# [orig] request.py
# URL replacement:
# http://www.city.nagoya.jp/shiminkeizai/page/0000007718.html
# ↓
# https://www.city.nagoya.jp/sportsshimin/page/0000007718.html
import re
import time
import pywikibot
import requests
from requests.exceptions import Timeout
from urllib.parse import urlparse

target_re = r"http:\/\/www\.city\.nagoya\.jp\/shiminkeizai\/page\/0000007718\.html"
replace_mae = "http://www.city.nagoya.jp/shiminkeizai/"
replace_ato = "https://www.city.nagoya.jp/sportsshimin/"
#max = 10
max = 120
sleepsec = 60

######################################################
# Processing mode: 0 = dry run (no save), 1 = save
#procmode = 0
procmode = 1
######################################################

# Extract the domain part of the first http(s) URL found in target.
def get_domain(target):
    url = ""
    result = re.search("(http[^ ]+)", target)
    if result:
        url = result.group(1)
    else:
        return target
    parsed_uri = urlparse(url)
    return '{uri.netloc}'.format(uri=parsed_uri)

# Return the final redirect target of the given URL.
# Returns "" for non-200 responses; timeouts and other errors are
# printed and re-raised, aborting the run.
def get_final_url(url):
    try:
        # PDFs are heavy, so use HEAD for them 2020.1.13
        if ".pdf" in url:
            response = requests.head(url, timeout=5.0)
        else:
            response = requests.get(url, timeout=5.0)
    except Timeout:
        print("ERROR: timeout")
        raise
    except Exception as e:
        print("ERROR: Exception")
        print(e)
        raise
    print(response.status_code)
    print(response.url)
    if response.status_code != 200:
        return ""
    return response.url

# If the given http link, rewritten to the new https path, answers 200
# without redirecting elsewhere, return that final URL; otherwise "".
def check_200_https(url):
    print("url=" + url)
    #httpsurl = url.replace("http:", "https:")
    httpsurl = url.replace(replace_mae, replace_ato)
    print("httpsurl=" + httpsurl)
    finalurl = get_final_url(httpsurl)
    print("finalurl=" + finalurl)
    if finalurl != "":
        if finalurl == httpsurl:
            if url != finalurl:
                return finalurl
    return ""

# Replace the target links in one page.
def replace_page(pagetitle):
    site = pywikibot.Site()
    page = pywikibot.Page(site, pagetitle)
    linelist = page.text.split('\n')
    comment_target_https = ""
    gaibu = 0
    modflag = 0
    outtext = ""
    for line in linelist:
        # Leave archive links untouched.
        if (re.search("web.archive.org", line) or
                re.search("Archive.today", line) or
                re.search("Wayback", line)):
            outtext += line + "\n"
            continue
        #if re.search("==[ ]*外部リンク", line):
        #    gaibu = 1
        if re.search(target_re, line):
            pattern = r"http://[^ \t\|\]\}<>\)]+"
            matchedlist = re.findall(pattern, line)
            if matchedlist:
                for url in matchedlist:
                    # Fragments of HTML comments can slip in, so reject them here 2020.8.8
                    if "--" in url:
                        continue
                    # Anchor the pattern so only URLs starting with the target match.
                    if target_re[0] == "^":
                        tmp_re = target_re
                    else:
                        tmp_re = "^" + target_re
                    print("tmp_re=" + tmp_re)
                    if re.search(tmp_re, url):
                        finalurl = check_200_https(url)
                        if finalurl != "" and finalurl != url:
                            line = line.replace(url, finalurl)
                            comment_target_https = comment_target_https + finalurl
                            print(gaibu, line)
                            modflag = 1
        outtext += line + "\n"
    if modflag == 1:
        # Sanity check: the only difference between the original page.text and
        # the new outtext should be the URL swap (each replacement changes the
        # length by about one byte: http -> https), so the totals should differ
        # by at most a few dozen bytes. Anything outside that window means
        # something went wrong, so treat it as an error.
        difflen = len(outtext) - len(page.text)
        print("difflen=" + str(difflen))
        if (difflen < -10) or (difflen > 50):
            raise Exception("difflen out of range: " + str(difflen))
        page.text = outtext
        if procmode == 1:
            #page.save("外部リンクの修正 " + comment_target_https + " ([[Wikipedia:Bot|Bot]]による編集)")
            #page.save("外部リンクの修正 http:// -> https:// ([[Wikipedia:Bot|Bot]]による編集)")
            #page.save("外部リンクの修正 http:// -> https:// (" + get_domain(target_re.replace("\\", "")) + ") ([[Wikipedia:Bot|Bot]]による編集)")
            page.save("[[Wikipedia:Bot作業依頼#名古屋市公式サイトのリンク置換依頼]] ([[Wikipedia:Bot|Bot]]による編集)")

# Return one page title still to be processed.
# Returns "" when nothing is left.
def get_pagetitle():
    path = "list"
    with open(path) as f:
        for s_line in f:
            s_line = s_line.rstrip("\n")
            if not s_line.endswith(",sumi"):
                return s_line
    return ""

# Mark a processed page title by appending ",sumi" (done) to its line.
def done_pagetitle(pagetitle):
    path = "list"
    alltext = ""
    with open(path) as f:
        for s_line in f:
            s_line = s_line.rstrip("\n")
            if pagetitle == s_line:
                s_line = s_line + ",sumi"
            alltext += s_line + "\n"
    with open(path, mode='w') as f:
        f.write(alltext)
    return ""

def sub():
    num = 0
    for i in range(max):
        num = num + 1
        pagetitle = get_pagetitle()
        print("[" + str(num) + "/" + str(max) + "]" + ":" + "pagetitle=" + pagetitle)
        if pagetitle == "":
            break
        replace_page(pagetitle)
        done_pagetitle(pagetitle)
        if i < (max - 1):
            print("sleep(" + str(sleepsec) + ")")
            time.sleep(sleepsec)

def main():
    sub()
    print("done.")

if __name__ == '__main__':
    main()
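For reference, request.py reads its work queue from a plain text file named list in the current directory: one page title per line, with ",sumi" appended once that page has been processed (see get_pagetitle and done_pagetitle above). The sketch below is not part of the logged script; it merely prepares such a file, and the page titles in it are hypothetical placeholders.

# prepare_list.py -- hypothetical helper, not part of the logged script.
# Writes a "list" work file in the format request.py expects:
# one page title per line; processed lines end in ",sumi".

titles = [
    "ExamplePageTitle1",  # hypothetical page title
    "ExamplePageTitle2",  # hypothetical page title
]

with open("list", "w") as f:
    for title in titles:
        f.write(title + "\n")

# After request.py finishes a page, done_pagetitle rewrites its line as
#   ExamplePageTitle1,sumi
# so an interrupted run can be restarted and resumes at the first unmarked line.

Because procmode = 0 skips page.save, one cautious sequence is a dry run with procmode = 0 to inspect the printed diagnostics, then the real run with procmode = 1.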