WikiPDF

TCS IT Wiz is coming up (the Kolkata edition is on the 9th), and that means a lot of research, much of it on Wikipedia. So I wrote a quick tool that exports a list of Wikipedia pages as a single PDF. First, install Python 2.7 (available from python.org) and download the following script:

import requests
import bs4
import io

# URL template for adding one article to the Special:Book collection.
# ``{0}`` is replaced with the (URL-encoded) article title.
ADD_URI = (
    'http://en.wikipedia.org/w/index.php'
    '?title=Special:Book&bookcmd=add_article&arttitle={0}&oldid=0'
)

# Special:Book page; the render request is POSTed to this address.
BOOK_URI = 'http://en.wikipedia.org/wiki/Special:Book'


def wikify(topic):
    """Return *topic* encoded for use as a URL query-string value.

    Spaces still become ``+``. Every other character that is not safe in
    a query string (``&``, ``+``, ``#``, ``?``, non-ASCII letters, ...) is
    percent-encoded, so titles such as ``AT&T`` or ``C++`` are no longer
    truncated or misread when substituted into a URL.
    """
    try:
        from urllib.parse import quote_plus  # Python 3
    except ImportError:
        from urllib import quote_plus  # Python 2
        # Python 2's quote_plus fails on non-ASCII unicode objects, so
        # hand it UTF-8 bytes instead (``str`` is the byte type there).
        if not isinstance(topic, str):
            topic = topic.encode('utf-8')
    return quote_plus(topic)

def main(args):
    """Collect the listed Wikipedia pages into a book and fetch it as a PDF.

    ``args`` is an argv-style list: ``args[0]`` is the program name and
    ``args[1]`` is the path of a text file holding one topic per line.
    Blank lines in that file are ignored.

    Each topic is added to a Special:Book collection, a render is
    requested, and then either the progress page is opened in a web
    browser (the function returns without downloading) or the progress
    page is polled until it reports completion and the PDF is written to
    ``args[1] + '.pdf'``.

    Returns ``None`` in every case. Written for Python 2 (``raw_input``);
    the single-argument ``print(...)`` calls behave the same there as the
    print statement.
    """
    import time  # imported once, not on every polling pass

    print('[ Wikipdf ]')
    print('(C) 2012 Aviral Dasgupta - www.aviraldg.com')
    if len(args) < 2:
        print('Usage: {0} <path.to.page.list>'.format(args[0]))
        return

    with io.open(args[1], 'r') as page_list:
        # Strip surrounding whitespace and drop blank lines so that an
        # empty title is never submitted.
        topics = [line.strip() for line in page_list if line.strip()]

    if not topics:
        print('No topics found in {0}.'.format(args[1]))
        return

    print('Downloading: ' + ', '.join(topics) + '.')
    print('({0} pages; this may take a while)'.format(len(topics)))

    # Every request goes through the same session.
    # NOTE(review): presumably the server ties the book being assembled to
    # the session cookies - confirm.
    s = requests.session()

    for topic in topics:
        encoded = wikify(topic)
        print('Adding {0}'.format(encoded))
        s.get(ADD_URI.format(encoded))

    print('Done.')
    p = s.post(BOOK_URI, {'formatSelect': 'rl', 'bookcmd': 'render'})

    print('Rendering...')
    print('Render progress page is at {0}'.format(p.url))
    print('You can check manually or allow Wikipdf to check and download.')
    opt = raw_input('Open web browser with progress page url? [Y/N]')
    if opt.lower().strip().startswith('y'):
        # Manual mode: hand the progress page to the browser and stop.
        import webbrowser
        webbrowser.open(p.url)
        return

    # Automatic mode: poll the progress page, waiting one second longer
    # before each successive check, until the page text says 'finished'.
    delay = 1
    while True:
        time.sleep(delay)
        print('Checking ({0}) ...'.format(delay))
        delay += 1
        p = s.get(p.url)
        if 'finished' in p.text:
            break

    print('Rendering done; downloading.')
    # Name the parser explicitly so the result does not depend on which
    # optional parser libraries happen to be installed.
    soup = bs4.BeautifulSoup(p.content, 'html.parser')
    # The download link is taken to be the fourth anchor on the finished
    # page; this is position-dependent and breaks if the layout changes.
    pdfuri = soup.find_all('a')[3]['href']

    content = s.get(pdfuri).content

    with io.open(args[1] + '.pdf', 'wb') as out:
        out.write(content)


# Script entry point: pass the raw command-line arguments to main().
if __name__ == '__main__':
    from sys import argv
    main(argv)

Then install pip by downloading and running https://raw.github.com/pypa/pip/master/contrib/get-pip.py, and execute the following commands:

aviraldg@aviraldg-netbook:~$ pip install beautifulsoup4
aviraldg@aviraldg-netbook:~$ pip install requests

Now just create a text file listing the topics you want to download, one per line, and drag that file onto the wikipdf script (or run the script from a terminal with the list file as its argument). The downloaded PDF is saved next to the list file, under the same name with ".pdf" appended.

Comment if you have problems or want to offer feedback.