| 1 : |
dantman |
3 |
#!/usr/bin/env python
|
| 2 : |
|
|
# -*- coding: utf-8 -*-
|
| 3 : |
|
|
"""
|
| 4 : |
|
|
This bot is used to Export pages from Wikipedia, alter them, then Import them to another wiki.
|
| 5 : |
|
|
Info: http://en.anime.wikia.com/wiki/Project:Bots/ExportImport
|
| 6 : |
|
|
"""
|
| 7 : |
|
|
|
| 8 : |
|
|
import sys, re
|
| 9 : |
|
|
import wikipedia, pagegenerators, catlib, config
|
| 10 : |
|
|
from time import *
|
| 11 : |
|
|
import xml
|
| 12 : |
|
|
import xml.dom.minidom as minidom
|
| 13 : |
|
|
from xml.dom.minidom import Node
|
| 14 : |
|
|
|
| 15 : |
|
|
class GEExport:
    """Export bot: download page histories from the English Wikipedia.

    Every page yielded by the page generator is requested from the site's
    export form in successive batches, and each batch of revisions is
    written to its own XML file under ``wpdumps/``.
    """

    def __init__(self, pageGenerator):
        """Remember the iterable of pages that run() will export."""
        self.pageGenerator = pageGenerator

    def _fetchDump(self, wp, address, predata):
        """POST the export form once and return the XML as a byte string.

        The time the request took is passed to
        wikipedia.get_throttle.setDelay() afterwards.
        """
        now = time()
        if wp.hostname() in config.authenticate.keys():
            # urllib2 is not imported at the top of the file; importing it
            # here keeps this branch from failing with a NameError.
            import urllib2
            # NOTE(review): these two header-style entries are url-encoded
            # into the POST body together with the form fields (unchanged
            # from the original code) -- confirm whether real HTTP headers
            # were intended.
            predata["Content-type"] = "application/x-www-form-urlencoded"
            predata["User-agent"] = wikipedia.useragent
            url = wp.protocol() + '://' + wp.hostname() + address
            response = urllib2.urlopen(
                urllib2.Request(url, wp.urlEncode(predata)))
            try:
                # Raw bytes from the server are used as-is; encoding them
                # again would fail on non-ASCII content.
                data = response.read()
            finally:
                response.close()
        else:
            response, data = wp.postForm(address, predata)
            # presumably postForm returns decoded text -- verify; it is
            # turned into bytes so both branches return the same kind of
            # value.
            data = data.encode(wp.encoding())
        wikipedia.get_throttle.setDelay(time() - now)
        return data

    def _revisionTimestamp(self, revision):
        """Return the text of the first <timestamp> child of ``revision``."""
        stamp = revision.getElementsByTagName('timestamp')[0]
        return ''.join([node.data for node in stamp.childNodes
                        if node.nodeType == Node.TEXT_NODE])

    def _dumpFileName(self, title, offset):
        """Build the dump path for one batch of a page's revisions.

        '/' in the title and ':' in the offset are replaced by '-'.
        """
        return 'wpdumps/%s-%s.xml' % (title.replace('/', '-'),
                                      offset.replace(':', '-'))

    def _saveDump(self, fileName, data):
        """Write ``data`` to ``fileName``, creating its directory if needed."""
        import os
        dumpDir = os.path.dirname(fileName)
        if dumpDir and not os.path.isdir(dumpDir):
            os.makedirs(dumpDir)
        # Binary mode: ``data`` is an already encoded byte string.
        f = open(fileName, 'wb')
        try:
            f.write(data)
        finally:
            f.close()

    def exportPage(self, page):
        """Download all revisions of ``page`` and save them batch by batch.

        The request offset starts at '1' and is moved to the timestamp of
        the last revision received after every batch. The loop ends when a
        response contains no <revision> element, or when the offset stops
        advancing.
        """
        wp = wikipedia.getSite(code=u'en', fam=u'wikipedia')
        address = wp.export_address()
        title = page.sectionFreeTitle().encode(wp.encoding())
        predata = {
            'action': 'submit',
            'pages': title,
            'offset': '1',
        }
        while True:
            wikipedia.get_throttle()
            wikipedia.output('\03{lightpurple}>>\03{default} \03{lightaqua}Exporting revisions.\03{default}')
            # Now make the actual request to the server
            data = self._fetchDump(wp, address, predata)

            revs = minidom.parseString(data).getElementsByTagName('revision')
            revCount = len(revs)
            if revCount == 0:
                wikipedia.output('\03{lightpurple}>>\03{default} \03{lightaqua}Returned no revisions, exporting for this page is complete.\03{default}')
                break

            # minidom hands back unicode text on Python 2, while ``title`` is
            # an encoded byte string; encode the timestamp too so that
            # building the next file name never mixes the two (which raises
            # UnicodeDecodeError for non-ASCII titles).
            timestamp = self._revisionTimestamp(revs[-1]).encode(wp.encoding())
            wikipedia.output('\03{lightpurple}>>\03{default} \03{lightaqua}Got %s revisions up to %s.\03{default}' % (revCount, timestamp))

            # The file is named after the offset that was requested, not the
            # one that will be requested next.
            fileName = self._dumpFileName(title, predata['offset'])
            wikipedia.output('\03{lightpurple}>>\03{default} \03{lightblue}Saving to %s.\03{default}' % fileName)
            self._saveDump(fileName, data)

            if timestamp == predata['offset']:
                # Asking again with the same offset would return the same
                # batch forever.
                wikipedia.output('\03{lightpurple}>>\03{default} \03{lightred}Offset did not advance, stopping export of this page.\03{default}')
                break
            predata['offset'] = timestamp

    def run(self):
        """Export every page produced by the page generator, in order."""
        wikipedia.output(u'\03{lightblue}Running Export bot.\03{default}')
        for page in self.pageGenerator:
            wikipedia.output('\03{lightpurple}>\03{default} \03{lightaqua}Doing \03{lightpurple}%s\03{default}' % page.aslink())
            self.exportPage(page)
|
| 66 : |
|
|
|
| 67 : |
|
|
def run(self):
|
| 68 : |
|
|
|
| 69 : |
|
|
wikipedia.output(u'\03{lightblue}Running Export bot.\03{default}')
|
| 70 : |
|
|
for page in self.pageGenerator:
|
| 71 : |
|
|
wikipedia.output('\03{lightpurple}>\03{default} \03{lightaqua}Doing \03{lightpurple}%s\03{default}' % page.aslink())
|
| 72 : |
|
|
self.exportPage(page)
|
| 73 : |
|
|
|
| 74 : |
|
|
class GEImport:
    """Import bot; at present it only announces that it is running."""

    def run(self):
        """Print the start-up banner of the import bot."""
        banner = u'\03{lightblue}Running Import bot.\03{default}'
        wikipedia.output(banner)
|
| 77 : |
|
|
|
| 78 : |
|
|
def main():
    """Parse the command line and run the requested bot.

    The first argument returned by wikipedia.handleArgs() is taken as the
    action ('export' or 'import'). Every later argument is offered to the
    page generator factory; the last generator it returns is the one used.
    The export bot requires a generator, the import bot takes none.
    """
    action = None
    gen = None

    # This factory is responsible for processing command line arguments
    # that are also used by other scripts and that determine which pages
    # to work on.
    genFactory = pagegenerators.GeneratorFactory()

    for arg in wikipedia.handleArgs():
        if action is None:
            action = arg
            continue
        generator = genFactory.handleArg(arg)
        if generator:
            gen = generator

    if action == 'export':
        if gen is None:
            wikipedia.output(u'\03{lightred}Export bot needs a page generator to itterate over.\03{default}')
            return
        bot = GEExport(gen)
    elif action == 'import':
        bot = GEImport()
    else:
        # Covers both a missing action and an unknown one.
        wikipedia.output(u'\03{lightred}Invalid bot action to run.\03{default}')
        return

    bot.run()
|
| 107 : |
|
|
|
| 108 : |
|
|
if __name__ == "__main__":
    # Script entry point: wikipedia.stopme() is called whether main()
    # returns normally or raises.
    try:
        main()
    finally:
        wikipedia.stopme()
|