A small snippet of code to get the National Geographic Photo of the Day.
It uses mechanize and Beautiful Soup 4 to help with the scraping.
It also has a function that walks backwards to pick up previous photos.
# -*- coding: utf-8 -*-
"""Download the National Geographic Photo of the Day wallpaper.

mechanize is used to browse the site and Beautiful Soup 4 to locate the
"Download Wallpaper" link on each page.  Files are saved under the last
path component of the link, relative to the current working directory.
"""

import os
import re
import sys

import mechanize
from bs4 import BeautifulSoup

MASTER_URL = "http://photography.nationalgeographic.com/photography/photo-of-the-day/?source=NavPhoPOD"

# Compiled once at import time instead of on every method call.
_WALLPAPER_LINK_TEXT = re.compile('Download Wallpaper')
_PREVIOUS_LINK_TEXT = re.compile('Previous')


class POD_Browser(mechanize.Browser):
    """A mechanize browser preconfigured for the Photo of the Day site.

    Constructing an instance immediately opens ``MASTER_URL``, so the
    browser always has a current response to read.
    """

    def __init__(self, *args, **kwargs):
        mechanize.Browser.__init__(self, *args, **kwargs)
        self.set_handle_robots(False)
        self.set_debug_redirects(False)
        self.set_debug_http(False)
        self.set_handle_equiv(True)
        self.set_handle_gzip(True)
        self.set_handle_redirect(True)
        self.open(MASTER_URL)


class Session(object):
    """A browsing session that starts on today's Photo of the Day page."""

    def __init__(self):
        self.browser = POD_Browser()

    def downloadPhotoOfTheDay(self):
        """Download the wallpaper linked from the current page, if any.

        Not every page has a "Download Wallpaper" link; when it is
        missing nothing is downloaded.  A file that already exists in
        the current directory is not fetched again.  The name of each
        newly saved file is printed to stdout.
        """
        page = self.browser.response().read()
        # Name the parser explicitly so results do not depend on which
        # optional parsers happen to be installed.
        soup = BeautifulSoup(page, 'html.parser')
        for tag in soup.find_all('a', text=_WALLPAPER_LINK_TEXT):
            href = tag.get('href')
            if not href:
                # An anchor without a target cannot be downloaded.
                continue
            filename = href.split('/')[-1]
            # If we've already downloaded the file, don't download it again.
            if not os.path.exists(filename):
                filename, _headers = self.browser.retrieve(href, filename)
                # Single-argument form prints identically on Python 2 and 3.
                print(filename)
                sys.stdout.flush()

    def goPrevious(self):
        """Follow the "Previous" link to go back one day.

        :raises mechanize.LinkNotFoundError: if the current page has no
            link whose text matches "Previous".
        """
        link = self.browser.find_link(text_regex=_PREVIOUS_LINK_TEXT)
        self.browser.follow_link(link)

    def downloadPriorPhotos(self, start=1, num=10):
        """Download some prior photos.

        You can start 7 days back and get 14 days of photos with
        ``start=7, num=14``.

        :param start: How many days to go back before starting.
        :param num: How many days to look at.
        """
        for _ in range(start):
            self.goPrevious()
        for _ in range(num):
            self.downloadPhotoOfTheDay()
            # Also steps back after the final download, so a subsequent
            # call continues with the next older day.
            self.goPrevious()


if __name__ == '__main__':
    s = Session()
    s.downloadPhotoOfTheDay()
    # To fetch the last week's worth instead, call
    # s.downloadPriorPhotos(num=7).
No comments:
Post a Comment