import sys
import os
import re
import urllib2
def normalize(name):
    """Return *name* with HTML tags and the characters '*' and '+' removed."""
    tagless = re.sub("<[^>]*>", "", name)
    return re.sub("[*+]", "", tagless)
seen = set()
def parseurl(url, download = True):
res = urllib2.urlopen(url)
page = res.read()
lines = page.split("\n")
img_patterns = ['"(http://[^\\"]*\\.JPEG)"',
'"(http://[^\\"]*\\.JPG)"',
'"(http://[^\\"]*\\.GIF)"',
'"(http://[^\\"]*\\.PNG)"'
]
usersearch = '(.*)'
sigstop = '__________________'
current_user = ""
for line in lines:
images = []
for pattern in img_patterns:
images += re.findall(pattern, line, re.IGNORECASE)
users = re.findall(usersearch, line)
if users:
current_user = normalize(users[-1])
if sigstop in line:
current_user = ""
for image in images:
if current_user and image not in seen and "smogon.com" not in image:
seen.add(image)
if "?" in image:
print "Fetch manually: ", current_user, image
continue
if not download:
continue
try:
os.mkdir(current_user)
except OSError:
pass
os.chdir(current_user)
print current_user, image
try:
image_contents = urllib2.urlopen(image).read()
except:
os.chdir("..")
continue
name = image.split("/")[-1]
open(name, 'w').write(image_contents)
os.chdir("..")
if len(sys.argv) < 3:
print "Usage: %s [ default: 0]" % sys.argv[0]
sys.exit(0)
start_page = int(sys.argv[1])
end_page = int(sys.argv[2])
head_start = int(sys.argv[3]) if len(sys.argv) > 3 else 0
for page_num in xrange(start_page - head_start, start_page):
print "processing page #%i" % page_num
parseurl("http://www.smogon.com/forums/showthread.php?t=10601&page=%i" % page_num, False)
for page_num in xrange(start_page, end_page + 1):
print "processing page #%i" % page_num
parseurl("http://www.smogon.com/forums/showthread.php?t=10601&page=%i" % page_num)