Wednesday, February 06, 2013

DARPA funds Python development for big data

DARPA funds Python development for big data

Friday, January 11, 2013

National Geographic Photo of the Day using Python

A small snippet of code to get the National Geographic Photo of the Day.
It uses mechanize and Beautiful Soup 4 to help with the scraping.

It also has a function to allow walking backwards to pick up previous photos.

# -*- coding: utf-8 -*-
import os
import re
import sys

import mechanize
from bs4 import BeautifulSoup

# Page that POD_Browser opens as soon as it is constructed; scraping and
# "Previous" navigation both start from here.
MASTER_URL = "http://photography.nationalgeographic.com/photography/photo-of-the-day/?source=NavPhoPOD"

class POD_Browser(mechanize.Browser):
    """
    mechanize browser preconfigured for the Photo of the Day pages.

    Creating an instance applies our handler/debug settings and then
    immediately opens MASTER_URL, so a new browser is already positioned
    on the current Photo of the Day page.
    """
    def __init__(self, *args, **kwargs):
        mechanize.Browser.__init__(self, *args, **kwargs)

        # (setter, value) pairs, applied in the order listed.
        configuration = (
            (self.set_handle_robots, False),
            (self.set_debug_redirects, False),
            (self.set_debug_http, False),
            (self.set_handle_equiv, True),
            (self.set_handle_gzip, True),
            (self.set_handle_redirect, True),
        )
        for apply_setting, value in configuration:
            apply_setting(value)

        # Land on the starting page right away.
        self.open(MASTER_URL)

class Session(object):
    """
    A Photo of the Day scraping session.

    Owns a POD_Browser and provides helpers to download the wallpaper
    linked from the page the browser is currently on, and to step back
    to earlier days.
    """
    def __init__(self):
        # POD_Browser opens MASTER_URL in its constructor, so the session
        # starts out on the current Photo of the Day page.
        self.browser = POD_Browser()

    @staticmethod
    def _filenameFromHref(href):
        """
        Work out the local filename for a wallpaper link.

        :param href: The link target, or None if the anchor had no href.
        :returns: The text after the last '/' in href, or None when href
                  is missing/empty or ends in '/' (nothing to name the
                  file after).
        """
        if not href:
            return None
        return href.split('/')[-1] or None

    def downloadPhotoOfTheDay(self):
        """
        Search the page looking for a Wallpaper link. Not all pages have
        a Wallpaper link, and we (politely) don't download the image in
        that case.

        Each file is saved in the current working directory under the
        last path component of its link, and the saved filename is
        printed. Links without a usable href are skipped.
        """
        page = self.browser.response().read()
        # NOTE(review): no parser is named here, so Beautiful Soup picks
        # one from whatever is installed -- consider pinning one.
        soup = BeautifulSoup(page)

        for tag in soup.find_all('a', text=re.compile('Download Wallpaper')):
            href = tag.get('href')
            filename = self._filenameFromHref(href)
            if filename is None:
                # No href, or nothing after the final '/': there is no
                # sensible name to save under, so skip this link.
                continue
            # If we've already downloaded the file, don't download it again.
            if os.path.exists(filename):
                continue
            filename, _headers = self.browser.retrieve(href, filename)
            # Parenthesised single-argument form prints the same on
            # Python 2 and Python 3.
            print(filename)
            sys.stdout.flush()

    def goPrevious(self):
        """
        Find the 'Previous' link on the current page and follow it,
        moving the browser back a day.
        """
        link = self.browser.find_link(text_regex=re.compile('Previous'))
        self.browser.follow_link(link)

    def downloadPriorPhotos(self, start=1, num=10):
        """
        Download some prior photos.
        You can start 7 days back and get 14 days of photos with
        start = 7, num = 14

        The browser is stepped back once more after every download, so
        on return it sits start + num pages before where it began.

        :param start: How many days to go back before starting
        :param num: How many days to look at.

        """
        # range() rather than xrange() so this also runs on Python 3.
        for _ in range(start):
            self.goPrevious()

        for _ in range(num):
            self.downloadPhotoOfTheDay()
            self.goPrevious()

if __name__ == '__main__':
    # Run as a script: fetch today's wallpaper only.
    session = Session()
    session.downloadPhotoOfTheDay()
    # To fetch the last week's worth, use this call instead of the one above.
    #session.downloadPriorPhotos(num = 7)