コンテンツにスキップ

英文维基 | 中文维基 | 日文维基 | 草榴社区

利用者:Mizusumashi/Bot/villagepump.py

# -*- coding: utf-8 -*-
"""
This script is under public domain, and comes with ABSOLUTELY NO WARRANTY.
You can use/modify/redistribute without any permission.

このスクリプトの著作権は放棄され、このスクリプトはパブリックドメインにあります。
このスクリプトは完全に無保証です。このスクリプトの動作については全く何も保証されておらず、このスクリプトを使用したいかなる結果についても責任は負われません。

このスクリプトは、pywikipedia フレームワークを使用しています。

このスクリプトは、[[Wikipedia:井戸端]]から話題を除去し、サブページに複製します。
"""

import string, time, datetime, re
import wikipedia

#
# Utilities to escape title
#
# Replacement table used by escape(): maps a single character that is
# awkward or forbidden in a wiki page title to a harmless stand-in.
#
# Keys must be exactly one character long, because escape() is always
# called with a single character.  The tab key is therefore written as a
# real tab (u'\t'); a raw literal would be the two characters backslash + t
# and could never match.
#
# NOTE(review): in the retrieved source the entries for # < > | { } / ( )
# mapped each character to itself, which is a no-op and leaves characters
# that MediaWiki rejects in titles.  They are mapped to their full-width
# forms here, in line with the full-width corner brackets already used for
# [ and ] -- confirm against the author's original table.
ec = {
    u'\t': u' ',
    u'#': u'\uff03',   # FULLWIDTH NUMBER SIGN
    u'<': u'\uff1c',   # FULLWIDTH LESS-THAN SIGN
    u'>': u'\uff1e',   # FULLWIDTH GREATER-THAN SIGN
    u'[': u'\u300c',   # LEFT CORNER BRACKET
    u']': u'\u300d',   # RIGHT CORNER BRACKET
    u'|': u'\uff5c',   # FULLWIDTH VERTICAL LINE
    u'{': u'\uff5b',   # FULLWIDTH LEFT CURLY BRACKET
    u'}': u'\uff5d',   # FULLWIDTH RIGHT CURLY BRACKET
    u'/': u'\uff0f',   # FULLWIDTH SOLIDUS
    u'(': u'\uff08',   # FULLWIDTH LEFT PARENTHESIS
    u')': u'\uff09',   # FULLWIDTH RIGHT PARENTHESIS
}

def escape(c):
    """Return the title-safe replacement for the single character ``c``.

    A character listed in the module-level ``ec`` table is swapped for its
    replacement; any other character is returned unchanged.  A falsy
    argument (for example the empty string) yields ``''``.
    """
    return ec.get(c, c) if c else ''

def comment(rest):
    """Skip the remainder of an HTML comment.

    ``rest`` is the text that follows an opening ``<!--``.  The return
    value is the text after the first ``-->``, up to the end of that line
    (``.`` in the pattern does not match a newline).  If the comment is
    never closed, or ``rest`` is empty, ``''`` is returned.
    """
    if not rest:
        return ''

    # One leftmost search finds the same terminator the old
    # "try re.match, drop one character, repeat" loop found, without
    # re-slicing the string at every position.
    m = re.search(r'-->(.*)', rest)
    if m:
        return m.group(1)

    return ''

def nowiki(rest):
    """Consume the inside of a ``<nowiki>`` element.

    ``rest`` is the text that follows the opening ``<nowiki>`` tag.
    Returns a pair ``(inner, remainder)``:

    * ``inner`` is the text up to the closing ``</nowiki>`` tag with every
      character passed through escape();
    * ``remainder`` is what follows the closing tag on the same line
      (``.`` in the pattern does not match a newline).

    If no closing tag is found, the whole of ``rest`` is escaped and the
    remainder is ``''``.
    """
    if not rest:
        return ('', '')

    # A single leftmost search replaces the original character-by-character
    # rescan; it locates the same closing tag.
    m = re.search(r'</\s*nowiki(?:\s+[^>]+)?>(.*)', rest)
    if m:
        raw, remainder = rest[:m.start()], m.group(1)
    else:
        raw, remainder = rest, ''

    inner = ''.join(escape(c) for c in raw)
    return (inner, remainder)

def escapeTitle(s):
    """Turn a section heading into text usable as a subpage title.

    The heading is scanned left to right.  At each position the first
    matching rule below is applied:

    * ``<!-- ... -->``          -- dropped (see comment());
    * ``<nowiki/>``             -- dropped;
    * ``<nowiki>...</nowiki>``  -- contents kept, escaped (see nowiki());
    * ``[[target|label]]``      -- replaced by the escaped label;
    * ``[[target]]``            -- replaced by the escaped target;
    * ``[url label]``           -- replaced by the escaped label;
    * ``[text]``                -- replaced by the escaped text;
    * ``'''`` and ``''``        -- dropped (bold / italic markup);
    * anything else             -- one character, passed through escape().

    Returns the escaped title.
    """
    # The patterns are pure ASCII, so plain raw strings behave the same as
    # the former ur'' literals when matched against unicode text.
    r = ''
    while s:
        m = re.match(r'<!--(.*)', s)
        if m:
            s = comment(m.group(1))
            continue

        m = re.match(r'<\s*nowiki(?:\s+[^>]+)?/>(.*)', s)
        if m:
            s = m.group(1)
            continue

        m = re.match(r'<\s*nowiki(?:\s+[^>]+)?>(.*)', s)
        if m:
            inner, rest = nowiki(m.group(1))
            r += inner
            s = rest
            continue

        # The link target must not contain ']' either: with a bare [^|]+
        # the target could run across a closing ']]' and swallow the text
        # between two links, e.g. "[[A]] x [[B|C]]" collapsed to "C".
        m = re.match(r'\[\[[^]|]+\|([^]]+)\]\](.*)', s)
        if m:
            r += escapeTitle(m.group(1))
            s = m.group(2)
            continue

        m = re.match(r'\[\[([^]]+)\]\](.*)', s)
        if m:
            r += escapeTitle(m.group(1))
            s = m.group(2)
            continue

        m = re.match(r'\[\S+ ([^]]+)\](.*)', s)
        if m:
            r += escapeTitle(m.group(1))
            s = m.group(2)
            continue

        m = re.match(r'\[([^]]+)\](.*)', s)
        if m:
            r += escapeTitle(m.group(1))
            s = m.group(2)
            continue

        # Bold must be tried before italic so that ''' is not read as ''
        # followed by a stray quote.
        m = re.match(r"'''(.*)", s)
        if m:
            s = m.group(1)
            continue

        m = re.match(r"''(.*)", s)
        if m:
            s = m.group(1)
            continue

        r = r + escape(s[0])
        s = s[1:]

    return r

#
# Bot
#
class Error(Exception):
    """Base exception of this script; ParamError derives from it."""
    
class DateException(Exception):
    """Raised when a date cannot be extracted from page text.

    The message passed to the constructor is stored in ``msg`` and is
    shown, prefixed with ``'Exception: '``, by ``str()``.
    """

    def __init__(self, msg):
        self.msg = msg

    def __str__(self):
        prefix = 'Exception: '
        return prefix + self.msg
    
class VillagePumpBot(object):
    """Bot that moves the sections of a discussion page into subpages.

    One run() does three things to the target page:

    1. removes transclusions of subpages that have timed out
       (removeIncludions);
    2. moves every level-2 section into a newly created subpage and leaves
       a transclusion of that subpage in its place (transToSubpages);
    3. normalises the layout of the transclusion templates (doStyle).

    Constructor parameters:

    target               -- title of the page to process.
    subpagePrefix        -- prefix prepended to an escaped section title to
                            form the subpage title.
    transcludionTemplate -- name of the template that transcludes a subpage.
    headerTemplate       -- name of the header template written at the top
                            of every subpage; its ``date`` parameter records
                            when the subpage was created.
    changedComment, subpagedComment, subpagecomment
                         -- ``string.Template`` sources for edit summaries.
    removedComment, styledComment
                         -- literal edit-summary fragments.
    firstEditTimeOut     -- days after subpage creation (header date).
    lastEditTimeOut      -- days after the subpage's last edit.
    test                 -- if true, nothing is saved; the would-be edits
                            are written to the log instead.
    """

    def __init__(self,
                 target = u'Wikipedia:井戸端',
                 subpagePrefix = u'Wikipedia:井戸端/subj/',
                 transcludionTemplate = u'井戸端サブページ',
                 headerTemplate = u'井戸端サブページ/ヘッダ',
                 changedComment = u'ロボットによる変更: ${edittime}(UTC)版より更新。',
                 subpagedComment = u'「${subpagePrefix}」配下へ分割(サブページ化)。',
                 removedComment = u'タイムアウトしたサブページの読み込み解除。',
                 styledComment = u'スタイル調整。',
                 subpagecomment = u'ロボットによる作成: '
                                  u'[[${target}]] '
                                  u'${edittime}(UTC)版より分割(サブページ化)。',
                 firstEditTimeOut = 30,
                 lastEditTimeOut = 10,
                 test = False):

        self.site = wikipedia.getSite()
        self.target = target
        self.subpagePrefix = subpagePrefix
        self.transcludionTemplate = transcludionTemplate
        self.headerTemplate = headerTemplate
        # The templated summaries are stored as bound ``substitute``
        # callables, so they are later invoked with keyword arguments.
        self.changedComment = string.Template(changedComment).substitute
        self.subpagedComment = string.Template(subpagedComment).substitute
        self.removedComment = removedComment
        self.styledComment = styledComment
        self.subpagecomment = string.Template(subpagecomment).substitute
        self.firstEditTimeOut = firstEditTimeOut
        self.lastEditTimeOut = lastEditTimeOut
        self.test = test

    def getSections(self, text):
        """Return the match objects of all level-2 sections in ``text``.

        Each match has the groups ``title`` (heading text without the
        ``==`` markers) and ``body`` (everything up to, but not including,
        the line break before the next level-2 heading or the next
        transclusion template that starts a line).
        """
        regex = (r'^==(?!=)\s*(?P<title>\S(.*\S)?)\s*==(?!=)\s*'
                 r'^(?P<body>'
                 r'(.|\n(?!\s*('
                 r'(^==(?!=)\s*\S(.*\S)?\s*==(?!=))'
                 r'|'
                 r'^({{\s*((T|t)emplate:)?'
                 + re.escape(self.transcludionTemplate) +
                 r'[^}]*}})'
                 r')))*'
                 r')')

        return list(re.finditer(regex, text, re.UNICODE | re.MULTILINE))

    def getTranscludions(self, text):
        """Return the match objects of all transclusion templates in ``text``.

        Group 0 is the whole ``{{...}}`` call; group ``params`` is the raw
        text between the template name and the closing braces.
        """
        regex = (r'{{\s*'
                 r'((T|t)emplate:)?' + re.escape(self.transcludionTemplate) +
                 r'(?P<params>[^}]*)'
                 r'}}')

        return list(re.finditer(regex, text, re.UNICODE | re.MULTILINE))

    def getParams(self, params):
        """Parse ``key = value`` pairs out of raw template parameter text.

        Returns a dict; surrounding whitespace is stripped from keys and
        values, and a repeated key keeps its last value.
        """
        regex = (r'(?P<key>[^\s|=]([^|=]*[^\s|=])?)'
                 r'\s*=\s*'
                 r'(?P<value>[^\s|=]([^|=]*[^\s|=])?)')

        result = {}
        for param in re.finditer(regex, params, re.UNICODE):
            result[param.group('key')] = param.group('value')

        return result

    def putPage(self, page, text, comment):
        """Save ``text`` to ``page`` with summary ``comment``.

        In test mode nothing is saved; title, summary and text are logged
        instead.
        """
        if not self.test:
            page.put(text, comment)
            return

        # Logged through wikipedia.output like every other message of this
        # class, rather than with bare print statements, so non-ASCII
        # titles and text go through the framework's output routine.
        wikipedia.output('***TEST***')
        wikipedia.output('title: >>' + page.title() + '<<')
        wikipedia.output('comment: >>' + comment + '<<')
        wikipedia.output('text: >>' + text + '<<')

    class UTC(datetime.tzinfo):
        """Fixed UTC time zone: zero offset, no daylight saving."""

        def utcoffset(self, dt):
            return datetime.timedelta(0)

        def dst(self, dt):
            return datetime.timedelta(0)

        def tzname(self, dt):
            return "UTC"

    def getEditDate(self, page):
        """Return ``page.editTime()`` as a UTC-aware datetime.

        The value is expected to begin with 14 digits, YYYYMMDDhhmmss.
        Raises DateException if it does not.
        """
        regex = (r'(?P<year>\d{4})'
                 r'(?P<month>\d{2})'
                 r'(?P<day>\d{2})'
                 r'(?P<hour>\d{2})'
                 r'(?P<min>\d{2})'
                 r'(?P<sec>\d{2})')

        match = re.match(regex, page.editTime())
        if not match:
            # Previously this fell through to an AttributeError on None.
            raise DateException('getEditDate')

        names = ('year', 'month', 'day', 'hour', 'min', 'sec')
        fields = [int(match.group(name)) for name in names]
        # Trailing arguments: microsecond, tzinfo.
        return datetime.datetime(*(fields + [0, self.UTC()]))

    def getTagDate(self, text):
        """Return the creation date recorded in a subpage's header template.

        Looks for ``{{<headerTemplate> | date = YYYY-M-D h:m ...}}`` in
        ``text``; anything after the minutes is ignored, so the result has
        zero seconds.  The result is UTC-aware.  Raises DateException if no
        such template call is found.
        """
        regex = (r'{{\s*'
                 r'((T|t)emplate:)?' + re.escape(self.headerTemplate) +
                 r'\s*\|\s*date\s*=\s*'
                 r'(?P<year>\d{4})-(?P<month>\d{1,2})-(?P<day>\d{1,2})\s+'
                 r'(?P<hour>\d{1,2}):(?P<min>\d{1,2})'
                 r'[^}]*}}')

        match = re.search(regex, text, re.UNICODE | re.MULTILINE)
        if not match:
            raise DateException('getTagDate')

        names = ('year', 'month', 'day', 'hour', 'min')
        fields = [int(match.group(name)) for name in names]
        # Trailing arguments: second, microsecond, tzinfo.
        return datetime.datetime(*(fields + [0, 0, self.UTC()]))

    def sleep(self):
        """Sleep for the put throttle's wait time if it exceeds one second.

        The throttle's lock is held while reading the wait time and while
        sleeping.
        """
        throttle = wikipedia.put_throttle

        throttle.lock.acquire()
        try:
            waittime = throttle.waittime()
            if waittime > 1:
                time.sleep(waittime)
        finally:
            throttle.lock.release()

    def printEscapedTitle(self, text):
        """Log every section title with its escaped form, then wait for input.

        This gives the operator a chance to abort before any subpage is
        created under a badly escaped title.
        """
        wikipedia.output('Escaped titles:')

        sections = self.getSections(text)
        for match in sections:
            title = match.group('title')
            wikipedia.output(' original title: ' + title)
            wikipedia.output(' escaped title : ' + escapeTitle(title))

        if not sections:
            wikipedia.output(" There is no section title to create subpage.")
        else:
            if len(sections) == 1:
                wikipedia.output(" There is one title of section to create subpage.")
            else:
                wikipedia.output(" There are some titles of sections to create subpages.")
            wikipedia.output(" If escaped title is wrong, kill this process.")

        wikipedia.input('Press any key to continue...')

    def removeIncludions(self, text):
        """Remove transclusions of timed-out subpages from ``text``.

        A transclusion is kept only while the subpage was edited within
        the last ``lastEditTimeOut`` days AND its header date is less than
        ``firstEditTimeOut`` days old.  Transclusions whose subpage cannot
        be determined, does not exist, or has no readable header date are
        left in place with a logged message.

        Returns ``(new_text, removed)`` where ``removed`` tells whether at
        least one transclusion was taken out.
        """
        removed = False
        now = datetime.datetime.now(self.UTC())
        lastLimit = datetime.timedelta(days=self.lastEditTimeOut)
        firstLimit = datetime.timedelta(days=self.firstEditTimeOut)

        for match in self.getTranscludions(text):
            transcludion = match.group(0)
            params = self.getParams(match.group('params'))

            # An explicit 'subpage' parameter wins over the title-derived name.
            if 'subpage' in params:
                subpage = params['subpage']
            elif 'title' in params:
                subpage = self.subpagePrefix + params['title']
            else:
                wikipedia.output('ERROR: no params \'subpage\' or \'title\'.')
                continue

            page = wikipedia.Page(self.site, subpage)
            if not page.exists():
                wikipedia.output('ERROR: subpage does not exist: ' + subpage)
                continue

            lastEdit = self.getEditDate(page)
            if now < lastEdit + lastLimit:
                # Still being edited: keep it unless it is older than
                # the creation time-out.
                try:
                    firstEdit = self.getTagDate(page.get())
                except DateException:
                    wikipedia.output('Exception: tag\'s date does not match: ' + subpage)
                    continue
                if now < firstEdit + firstLimit:
                    wikipedia.output('.')
                    continue

            wikipedia.output('Removing includion [[' + subpage + ']]...')
            text = re.sub(r'\s*' + re.escape(transcludion) + r'\s*', '\n', text)
            removed = True

        return text, removed

    def _createSubpage(self, subpage, text, comment):
        """Create ``subpage`` with ``text`` unless it already exists.

        Returns True if the title was free (the page was saved, or would
        have been in test mode), False if a page of that title exists.
        """
        self.sleep()
        page = wikipedia.Page(self.site, subpage)
        if page.exists():
            return False

        self.putPage(page, text, comment)
        return True

    def _replaceBlock(self, text, old, new):
        """Replace ``old`` plus surrounding whitespace by ``new`` on its own line.

        The replacement is supplied through a function so that backslashes
        in ``new`` (which contains user-written section titles) are
        inserted literally instead of being parsed as escapes or group
        references by re.sub.
        """
        replacement = '\n' + new + '\n'
        return re.sub(r'\s*' + re.escape(old) + r'\s*',
                      lambda match: replacement,
                      text)

    def transToSubpages(self, text, edit):
        """Move every level-2 section of ``text`` into its own subpage.

        ``edit`` is the datetime of the revision being split; it is quoted
        in the subpage's edit summary.  For each section a subpage is
        created and the section is replaced by a transclusion template.

        The subpage title is ``subpagePrefix`` + escaped section title.
        If that page already exists, `` YYYYMMDD`` (current UTC date) is
        appended; if that exists too, ``-0``, ``-1``, ... are appended
        after the date until a free title is found.

        Returns ``(new_text, subpaged)`` where ``subpaged`` tells whether
        at least one section was moved.
        """
        subpaged = False
        comment = self.subpagecomment(target = self.target,
                                      edittime = edit.strftime('%Y-%m-%d %H:%M:%S'))
        datePostfix = ' ' + time.strftime('%Y%m%d', time.gmtime())

        for match in self.getSections(text):
            section, title, body = match.group(0, 'title', 'body')
            content = ('<noinclude>{{\n'
                       '  ' + self.headerTemplate + '\n'
                       '  | date = {{subst:#time:Y-m-d H:i:s}}\n'
                       '}}</noinclude>\n'
                       '== ' + title + ' ==\n' +
                       body)

            subpageMain = self.subpagePrefix + escapeTitle(title)
            subpage = subpageMain
            if not self._createSubpage(subpage, content, comment):
                # The candidate must be stored back into ``subpage``;
                # otherwise the same existing title is retried forever.
                subpage = subpageMain + datePostfix
                i = 0
                while not self._createSubpage(subpage, content, comment):
                    subpage = '%s%s-%d' % (subpageMain, datePostfix, i)
                    i += 1

            if subpage == self.subpagePrefix + title:
                new = '{{ ' + self.transcludionTemplate + ' | title = ' + title + ' }}'
            else:
                # The subpage name can no longer be derived from the title,
                # so it is given explicitly.
                wikipedia.output('title is changed: \''
                                 + title
                                 + '\' -> \''
                                 + subpage
                                 + '\'')
                new = ('{{\n'
                       '  ' + self.transcludionTemplate + '\n'
                       '  | title = ' + title + '\n'
                       '  | subpage = ' + subpage + '\n'
                       '}}')

            text = self._replaceBlock(text, section, new)
            subpaged = True

        return text, subpaged

    def doStyle(self, text):
        """Normalise the layout of every transclusion template in ``text``.

        Calls with zero or one parameter are written on one line; calls
        with more parameters are written one parameter per line, keeping
        only ``title`` and ``subpage``.  Each call ends up on its own line.

        Returns ``(new_text, changed)``.
        """
        oldText = text

        for match in self.getTranscludions(text):
            old = match.group(0)
            params = self.getParams(match.group('params'))

            if len(params) == 0:
                new = '{{ %s }}' % self.transcludionTemplate
            elif len(params) == 1:
                key, value = params.popitem()
                new = '{{ %s | %s = %s }}' \
                      % (self.transcludionTemplate, key, value)
            else:
                new = '{{\n  ' + self.transcludionTemplate + '\n'
                if 'title' in params:
                    new += '  | title = %s\n' % params['title']
                if 'subpage' in params:
                    new += '  | subpage = %s\n' % params['subpage']
                new += '}}'

            text = self._replaceBlock(text, old, new)

        return text, (oldText != text)

    def run(self):
        """Process the target page once and save it if anything changed."""
        if self.test:
            wikipedia.output('***** TEST MODE ******')

        page = wikipedia.Page(self.site, self.target)
        text = page.get()
        edit = self.getEditDate(page)

        self.printEscapedTitle(text)

        text, removed = self.removeIncludions(text)
        text, subpaged = self.transToSubpages(text, edit)
        text, styled = self.doStyle(text)

        if (not removed) and (not subpaged) and (not styled):
            wikipedia.output("Not removed, Not subpaged, Not styled.")
            return

        # The summary names the source revision and lists what was done.
        comment = self.changedComment(edittime = edit.strftime('%Y-%m-%d %H:%M:%S'))
        if subpaged:
            comment += self.subpagedComment(subpagePrefix = self.subpagePrefix)
        if removed:
            comment += self.removedComment
        if styled:
            comment += self.styledComment

        self.putPage(page, text, comment)

#
# Main function for front end
#
class ParamError(Error):
    """Base class of the configuration and command-line errors below.

    The script's entry point catches this type and logs its message.
    """

class ConfigNotExist(ParamError):
    """A configuration module could not be imported.

    ``name`` is the module name that was requested.
    """

    def __init__(self, name):
        self.name = name

    def __str__(self):
        prefix = 'Error: module does not exist: '
        return prefix + self.name

class ConfigNotStr(ParamError):
    """A configuration value that must be a string has another type.

    ``param`` is the setting's name and ``module`` the configuration
    module it was read from.
    """

    def __init__(self, param, module):
        self.param = param
        self.module = module

    def __str__(self):
        # Module objects expose ``__name__``; they have no ``name``
        # attribute, so reading ``module.name`` raised AttributeError while
        # the message was being built.  ``name`` is still honoured as a
        # fallback for other objects.
        moduleName = getattr(self.module, '__name__', None) \
                     or getattr(self.module, 'name', self.module)
        return 'Error: config parameter is not string: %s in %s' % (self.param, moduleName)

class ConfigNotNum(ParamError):
    """A configuration value that must be a number has another type.

    ``param`` is the setting's name and ``module`` the configuration
    module it was read from.
    """

    def __init__(self, param, module):
        self.param = param
        self.module = module

    def __str__(self):
        # Module objects expose ``__name__``; they have no ``name``
        # attribute, so reading ``module.name`` raised AttributeError while
        # the message was being built.  ``name`` is still honoured as a
        # fallback for other objects.
        moduleName = getattr(self.module, '__name__', None) \
                     or getattr(self.module, 'name', self.module)
        return 'Error: config parameter is not number: %s in %s' % (self.param, moduleName)

class ConfigNotBool(ParamError):
    """A configuration value that must be a bool has another type.

    ``param`` is the setting's name and ``module`` the configuration
    module it was read from.
    """

    def __init__(self, param, module):
        self.param = param
        self.module = module

    def __str__(self):
        # Module objects expose ``__name__``; they have no ``name``
        # attribute, so reading ``module.name`` raised AttributeError while
        # the message was being built.  ``name`` is still honoured as a
        # fallback for other objects.
        moduleName = getattr(self.module, '__name__', None) \
                     or getattr(self.module, 'name', self.module)
        return 'Error: config parameter is not bool: %s in %s' % (self.param, moduleName)

class ConfigUnknown(ParamError):
    """A configuration module contains a setting that is not recognised.

    ``param`` is the setting's name and ``module`` the configuration
    module it was read from.
    """

    def __init__(self, param, module):
        self.param = param
        self.module = module

    def __str__(self):
        # Two fixes: the format string ended in '&s' instead of '%s', which
        # made the %-formatting raise TypeError; and module objects expose
        # ``__name__``, not ``name``.
        moduleName = getattr(self.module, '__name__', None) \
                     or getattr(self.module, 'name', self.module)
        return 'Error: config parameter is unknown: %s in %s' % (self.param, moduleName)

class CommandLineParamDuplication(ParamError):
    """The same command-line parameter was given more than once.

    ``param`` is the parameter's name.
    """

    def __init__(self, param):
        self.param = param

    def __str__(self):
        prefix = 'Error: command line parameter is set twice: '
        return prefix + self.param
        
class CommandLineParamNotNum(ParamError):
    """A numeric command-line parameter was given a non-numeric value.

    ``param`` is the parameter's name.
    """

    def __init__(self, param):
        self.param = param

    def __str__(self):
        prefix = 'Error: command line parameter is not number: '
        return prefix + self.param

class CommandLineParamNotBool(ParamError):
    """A boolean command-line parameter was given an unrecognised value.

    ``param`` is the parameter's name.
    """

    def __init__(self, param):
        self.param = param

    def __str__(self):
        prefix = 'Error: command line parameter is not bool: '
        return prefix + self.param

class CommandLineParamUnknown(ParamError):
    """A command-line argument matched none of the known parameters.

    ``param`` is the offending argument as it was received.
    """

    def __init__(self, param):
        self.param = param

    def __str__(self):
        prefix = 'Error: command line parameter is unknown: '
        return prefix + self.param

def main():
    """Collect the settings, build a VillagePumpBot and run it.

    Settings are gathered into one dict; a later source overrides an
    earlier one:

    1. the module ``villagepump_config``, if it can be imported;
    2. the command-line arguments, in the order given:
       ``-config:MODULE`` loads settings from another module (which must
       exist), ``-PARAM:VALUE`` sets a single setting.

    Boolean values on the command line are true/on/yes or false/off/no,
    in any letter case.  Numeric values are read as int, else as float.

    Raises a ParamError subclass for a missing ``-config`` module, a
    setting of the wrong type, a repeated command-line parameter or an
    unknown argument.  Names in a configuration module that are not known
    settings are ignored.
    """
    strParams = ('target',
                 'subpagePrefix',
                 'transcludionTemplate',
                 'headerTemplate',
                 'changedComment',
                 'subpagedComment',
                 'removedComment',
                 'styledComment',
                 'subpagecomment')
    numParams = ('firstEditTimeOut', 'lastEditTimeOut')
    boolParams = ('test',)

    trueWords = (u'true', u'on', u'yes')
    falseWords = (u'false', u'off', u'no')

    params = {}

    def checkConfigs(name, ignoreNotExist):
        """Copy the known settings of module ``name`` into ``params``."""
        try:
            module = __import__(name)
            # Re-read the file in case the module was imported before.
            reload(module)
        except ImportError:
            if ignoreNotExist:
                return
            raise ConfigNotExist(name)

        configs = module.__dict__

        # The errors are raised with the setting's name; the original code
        # referred to an undefined variable (and a misspelt exception
        # class), which turned every type error into a NameError.
        for key in strParams:
            if key in configs:
                if not isinstance(configs[key], (str, unicode)):
                    raise ConfigNotStr(key, module)
                params[key] = unicode(configs[key])

        for key in numParams:
            if key in configs:
                value = configs[key]
                # bool is a subclass of int but is not accepted as a number.
                if isinstance(value, bool) or not isinstance(value, (int, float)):
                    raise ConfigNotNum(key, module)
                params[key] = value

        for key in boolParams:
            if key in configs:
                if not isinstance(configs[key], bool):
                    raise ConfigNotBool(key, module)
                params[key] = configs[key]

    def parseNum(param, value):
        """Return ``value`` as int or, failing that, float."""
        for convert in (int, float):
            try:
                return convert(value)
            except ValueError:
                pass
        raise CommandLineParamNotNum(param)

    def parseBool(param, value):
        """Return the bool spelled by ``value`` (case-insensitive)."""
        word = value.lower()
        if word in trueWords:
            return True
        if word in falseWords:
            return False
        raise CommandLineParamNotBool(param)

    checkConfigs('villagepump_config', True)

    seen = set()
    for arg in wikipedia.handleArgs():
        # Every accepted argument has the form -NAME:VALUE; VALUE may
        # itself contain colons.
        if not arg.startswith('-') or ':' not in arg:
            raise CommandLineParamUnknown(arg)
        name, _, value = arg[1:].partition(':')

        if name == 'config':
            checkConfigs(value, False)
            continue

        if name not in strParams \
           and name not in numParams \
           and name not in boolParams:
            raise CommandLineParamUnknown(arg)

        if name in seen:
            raise CommandLineParamDuplication(name)
        seen.add(name)

        value = unicode(value)
        if name in numParams:
            value = parseNum(name, value)
        elif name in boolParams:
            value = parseBool(name, value)
        params[name] = value

    bot = VillagePumpBot(**params)
    bot.run()

# Script entry point: run main() only when executed directly, not on import.
if __name__ == '__main__':
    try:
        main()
    except ParamError, e:
        # Configuration and command-line problems are logged as a plain
        # message; any other exception propagates.
        wikipedia.output(str(e))
    finally:
        # Executed whether or not main() raised.
        # NOTE(review): presumably releases the framework's resources --
        # confirm against the pywikipedia documentation.
        wikipedia.stopme()