Crowdsourcing
        with Django
           EuroPython, 30th June 2009
Simon Willison · http://simonwillison.net/ · @simonw
“Web development on
journalism deadlines”
The back story...
November 2000
The Freedom of Information Act
Heather Brooke

• http://www.guardian.co.uk/politics/
  2009/may/08/mps-expenses-telegraph-
  checquebook-journalism

• http://www.guardian.co.uk/politics/
  2009/may/15/mps-expenses-heather-
  brooke-foi
2004
The request
January 2005
 The FOI request
July 2006
The FOI commissioner
May 2007
The FOI (Amendment) Bill
February 2008
The Information Tribunal
“Transparency will
damage democracy”
May 2008
The high court
January 2009
The exemption law
March 2009
  The mole
“All of the receipts of 650-odd MPs,
redacted and unredacted, are for sale
 at a price of £300,000, so I am told.
 The price is going up because of the
         interest in the subject.”
                           Sir Stuart Bell, MP
                        Newsnight, 30th March
8th May, 2009
The Daily Telegraph
At the Guardian...
April: “Expenses are due out
  in a couple of months, is
there anything we can do?”
June: “Expenses have been
bumped forward, they’re out
        next week!”
Thursday 11th June
  The proof-of-concept
Monday 15th June
The tentative go-ahead
Tuesday 16th June
Designer + client-side engineer
Wednesday 17th June
   Operations engineer
Thursday 18th June
    Launch day!
How we built it
$ convert Frank_Comm.pdf pages.png
Models
class Party(models.Model):
    """Party that an MP belongs to (referenced by MP.party)."""

    name = models.CharField(max_length=100)

class Constituency(models.Model):
    """Constituency that an MP is attached to (referenced by MP.constituency)."""

    name = models.CharField(max_length=100)

class MP(models.Model):
    """An MP, linked to exactly one Party and one Constituency."""

    name = models.CharField(max_length=100)
    party = models.ForeignKey(Party)
    constituency = models.ForeignKey(Constituency)
    # Both URL fields are optional (blank=True) plain strings.
    guardian_url = models.CharField(max_length=255, blank=True)
    guardian_image_url = models.CharField(max_length=255, blank=True)
class FinancialYear(models.Model):
    """A financial year label (at most 10 characters) that documents refer to."""

    name = models.CharField(max_length=10)

class Document(models.Model):
    """A document file belonging to one MP for one financial year."""

    title = models.CharField(max_length=100, blank=True)  # optional
    filename = models.CharField(max_length=100)
    mp = models.ForeignKey(MP)
    financial_year = models.ForeignKey(FinancialYear)

class Page(models.Model):
    """A single numbered page within a Document."""

    # NOTE(review): other snippets in this file use `doc.pages` and filter on
    # `is_reviewed`; neither a related_name of 'pages' nor an `is_reviewed`
    # field is declared here. Presumably this listing is a simplified version
    # of the real model -- confirm before relying on it.
    document = models.ForeignKey(Document)
    page_number = models.IntegerField()
class User(models.Model):
    """Site user record: creation timestamp, username and optional password hash."""

    created = models.DateTimeField(auto_now_add=True)
    # NOTE(review): this is a TextField with max_length; Django documents that
    # TextField does not enforce max_length at the database level, so a
    # CharField may have been intended. Left unchanged because switching the
    # field type alters the column definition -- confirm before changing.
    username = models.TextField(max_length=100)
    password_hash = models.CharField(max_length=128, blank=True)

class LineItemCategory(models.Model):
    """Named category that can be attached to line items."""

    order = models.IntegerField(default=0)
    name = models.CharField(max_length=32)

class LineItem(models.Model):
    """An amount recorded by a user against a page, typed as claim or proof."""

    # NOTE(review): the page filters elsewhere in this file query
    # `line_items__isnull`, which implies a related_name/related query name of
    # 'line_items' on `page`; none is declared here -- confirm against the
    # real model.
    user = models.ForeignKey(User)
    page = models.ForeignKey(Page)
    type = models.CharField(
        max_length=16,
        choices=(('claim', 'claim'), ('proof', 'proof')),
        db_index=True,
    )
    date = models.DateField(null=True, blank=True)
    amount = models.DecimalField(max_digits=20, decimal_places=2)
    description = models.CharField(max_length=255, blank=True)
    created = models.DateTimeField(auto_now_add=True, db_index=True)
    categories = models.ManyToManyField(LineItemCategory, blank=True)
class Vote(models.Model):
    """A vote cast by a user on a page; base class of TypeVote and InterestingVote.

    Both foreign keys expose the reverse accessor ``votes``.
    """

    user = models.ForeignKey(User, related_name='votes')
    page = models.ForeignKey(Page, related_name='votes')
    obsolete = models.BooleanField(default=False)
    vote_type = models.CharField(max_length=32, blank=True)
    ip_address = models.CharField(max_length=32)
    created = models.DateTimeField(auto_now_add=True)

class TypeVote(Vote):
    """Vote classifying a page as claim, proof, blank or other."""

    type = models.CharField(
        max_length=10,
        choices=(
            ('claim', 'Claim'),
            ('proof', 'Proof'),
            ('blank', 'Blank'),
            ('other', 'Other'),
        ),
    )

class InterestingVote(Vote):
    """Vote recording how interesting a page is (no / yes / known / very)."""

    status = models.CharField(
        max_length=10,
        choices=(
            ('no', 'Not interesting'),
            ('yes', 'Interesting'),
            ('known', 'Interesting but known'),
            ('very', 'Investigate this!'),
        ),
    )
Frictionless
registration
Page filters
# Ordered (name, lookup-kwargs) pairs describing the available page filters.
# The tuple is iterated in order elsewhere in this file to build one count
# per filter, so the order here is the order of that list.
page_filters = (
    # Maps name of filter to dictionary of kwargs to doc.pages.filter()
    ('reviewed', {
        'votes__isnull': False
    }),
    ('unreviewed', {
        'votes__isnull': True
    }),
    ('with line items', {
        'line_items__isnull': False
    }),
    ('interesting', {
        'votes__interestingvote__status': 'yes'
    }),
    ('interesting but known', {
        'votes__interestingvote__status': 'known'
# NOTE(review): the remaining entries are elided ("...") in this listing.
...
)
# The same pairs as a dict, for looking up a filter's kwargs by its name.
page_filters_lookup = dict(page_filters)
# Start from every page of the document and, when a filter name was given,
# narrow the queryset using that filter's lookup kwargs.
pages = doc.pages.all()
if page_filter:
    kwargs = page_filters_lookup.get(page_filter)
    if kwargs is None:
        # Unknown filter name. The call form of raise is valid on both
        # Python 2 and Python 3; the "raise Class, value" form is Python 2 only.
        raise Http404('Invalid page filter: %s' % page_filter)
    # distinct() because the lookups join across related rows, which can
    # repeat a page once per matching related row.
    pages = pages.filter(**kwargs).distinct()

# Build the filters: one {'name', 'count'} dict per entry of page_filters,
# in the same order. Each count is computed against all of the document's
# pages, independent of the currently selected filter.
filters = []
for name, kwargs in page_filters:
    filters.append({
        'name': name,
        'count': doc.pages.filter(**kwargs).distinct().count(),
    })
Matching names
http://github.com/simonw/datamatcher
On the day
def get_mp_pages():
    """Scrape INDEX_URL for the links to each MP's allowances page.

    Returns a list of (mp-name, mp-page-url) tuples, one for every anchor
    whose title attribute ends with "'s allowances"; the name is that title
    with "'s allowances" removed.
    """
    suffix = "'s allowances"
    index = Soup(urllib.urlopen(INDEX_URL))
    return [
        (anchor['title'].replace(suffix, ''), anchor['href'])
        for anchor in index.findAll('a')
        if anchor.get('title', '').endswith(suffix)
    ]
def get_pdfs(mp_url):
    """Scrape one MP's page for the PDFs listed in its table rows.

    Returns a list of (description, years, pdf-url, size) tuples, one per
    <tr> after the first. Every such row must contain exactly three <td>
    cells, otherwise the unpacking raises ValueError.
    """
    page = Soup(urllib.urlopen(mp_url))
    results = []
    # Skip the first row, it's the table header
    for row in page.findAll('tr')[1:]:
        name_cell, year_cell, pdf_cell = row.findAll('td')
        description = name_cell.string
        years = year_cell.string
        anchor = pdf_cell.find('a')
        href = anchor['href']
        # The size is the link's last child node with its parentheses removed.
        size = anchor.contents[-1].replace('(', '').replace(')', '')
        results.append((description, years, href, size))
    return results
“Drop Everything”
Photoshop + AppleScript
           vs.
     Java + IntelliJ
Images on our
docroot (S3 upload
was taking too long)
Blitz QA
Launch! (on EC2)
Crash #1: more
Apache children than
MySQL connections
# Number of pages that have no votes at all. The isnull lookup joins pages to
# votes (see the generated SQL that follows in this file).
unreviewed_count = Page.objects.filter(votes__isnull=True).distinct().count()
-- Counts distinct pages that have no matching row in expenses_vote:
-- the LEFT OUTER JOIN keeps vote-less pages, whose vote id is then NULL.
SELECT
  COUNT(DISTINCT `expenses_page`.`id`)
FROM
  `expenses_page` LEFT OUTER JOIN `expenses_vote` ON (
     `expenses_page`.`id` = `expenses_vote`.`page_id`
  ) WHERE `expenses_vote`.`id` IS NULL
# Count of pages without any votes, cached for 60 seconds so the join/count
# query is not run on every request.
#
# The key is held in one variable so the read and the write cannot drift
# apart: previously the value was stored under 'homepage: unreviewed_count'
# (with a space) but read from 'homepage:unreviewed_count', so the cached
# value was never found.
cache_key = 'homepage:unreviewed_count'
unreviewed_count = cache.get(cache_key)
if unreviewed_count is None:
    # Cache miss: compute the count and store it.
    unreviewed_count = Page.objects.filter(
        votes__isnull=True
    ).distinct().count()
    cache.set(cache_key, unreviewed_count, 60)
• With 70,000 pages and a LOT of votes...
 • DB takes up 135% of CPU
• Cache the count in memcached...
 • DB drops to 35% of CPU
# Pages with no votes at all.
unreviewed_count = Page.objects.filter(votes__isnull=True).distinct().count()

# Pages with at least one vote; distinct() so a page with several votes is
# counted once.
reviewed_count = Page.objects.filter(votes__isnull=False).distinct().count()
# Same count taken from the is_reviewed flag on the page itself, so no
# votes lookup and no distinct() are involved.
unreviewed_count = Page.objects.filter(is_reviewed=False).count()
Migrating to InnoDB
on a separate server
# Stream a dump of the mp_expenses database from mps-live into mysql on
# mysql-big, rewriting the dump text on the way: ENGINE=MyISAM becomes
# ENGINE=InnoDB and CHARSET=latin1 becomes CHARSET=utf8.
ssh mps-live "mysqldump mp_expenses" |
sed 's/ENGINE=MyISAM/ENGINE=InnoDB/g' |
  sed 's/CHARSET=latin1/CHARSET=utf8/g' |
  ssh mysql-big "mysql -u root mp_expenses"
“next” button
def next_global(request):
    """Redirect to a randomly chosen unreviewed page from the whole site.

    Returns a redirect to the chosen page's absolute URL, or a plain
    HttpResponse saying that all pages have been reviewed when no
    unreviewed page exists.
    """
    # Slice to a single row *before* the queryset is evaluated. Truth-testing
    # the unsliced queryset would fetch every unreviewed page (in random
    # order) only for the first one to be used.
    candidates = list(
        Page.objects.filter(is_reviewed=False).order_by('?')[:1]
    )
    if not candidates:
        return HttpResponse(
            'All pages have been reviewed!'
        )
    return Redirect(candidates[0].get_absolute_url())
import random

def next_global_from_cache(request):
    """Redirect to a random page picked from the cached unreviewed page ids.

    Falls back to next_global() when the 'unreviewed_page_ids' cache entry
    is missing or empty.
    """
    cached_ids = cache.get('unreviewed_page_ids')
    if not cached_ids:
        return next_global(request)
    # The URL is built directly from the id, so no Page is loaded here.
    chosen = random.choice(cached_ids)
    return Redirect('/page/%s/' % chosen)
from django.core.management.base import BaseCommand
from mp_expenses.expenses.models import Page
from django.core.cache import cache

class Command(BaseCommand):
    """Management command that stores a batch of unreviewed page ids in the cache."""

    help = "\n   populate unreviewed_page_ids in memcached\n   "
    requires_model_validation = True
    can_import_settings = True

    def handle(self, *args, **options):
        """Cache up to 1000 primary keys of pages not marked as reviewed.

        The list is stored under the 'unreviewed_page_ids' cache key with
        the cache's default timeout.
        """
        unreviewed = Page.objects.exclude(is_reviewed=True)
        ids = list(unreviewed.values_list('pk', flat=True)[:1000])
        cache.set('unreviewed_page_ids', ids)
The numbers
Final thoughts

• High score tables help
• MP photographs really help
• Keeping up the interest is hard
• Next step: start releasing the data

Crowdsourcing with Django

  • 1.
    Crowdsourcing with Django EuroPython, 30th June 2009 Simon Willison · http://simonwillison.net/ · @simonw
  • 2.
  • 3.
  • 4.
    November 2000 The Freedom of Information Act
  • 5.
    Heather Brooke • http://www.guardian.co.uk/politics/ 2009/may/08/mps-expenses-telegraph- checquebook-journalism • http://www.guardian.co.uk/politics/ 2009/may/15/mps-expenses-heather- brooke-foi
  • 6.
  • 7.
    January 2005 The FOI request
  • 8.
    July 2006 The FOI commissioner
  • 9.
    May 2007 The FOI (Amendment) Bill
  • 10.
  • 11.
  • 12.
  • 13.
  • 16.
    March 2009 The mole
  • 17.
    “All of the receipts of 650-odd MPs, redacted and unredacted, are for sale at a price of £300,000, so I am told. The price is going up because of the interest in the subject.” Sir Stuart Bell, MP Newsnight, 30th March
  • 18.
    8th May, 2009 The Daily Telegraph
  • 19.
  • 20.
    April: “Expenses are due out in a couple of months, is there anything we can do?”
  • 21.
    June: “Expenses have been bumped forward, they’re out next week!”
  • 22.
    Thursday 11th June The proof-of-concept
  • 23.
    Monday 15th June The tentative go-ahead
  • 24.
    Tuesday 16th June Designer + client-side engineer
  • 25.
    Wednesday 17th June Operations engineer
  • 26.
  • 33.
  • 36.
  • 38.
  • 39.
    class Party(models.Model): name = models.CharField(max_length=100) class Constituency(models.Model): name = models.CharField(max_length=100) class MP(models.Model): name = models.CharField(max_length=100) party = models.ForeignKey(Party) constituency = models.ForeignKey(Constituency) guardian_url = models.CharField(max_length=255, blank=True) guardian_image_url = models.CharField(max_length=255, blank=True)
  • 40.
    class FinancialYear(models.Model): name = models.CharField(max_length=10) class Document(models.Model): title = models.CharField(max_length=100, blank=True) filename = models.CharField(max_length=100) mp = models.ForeignKey(MP) financial_year = models.ForeignKey(FinancialYear) class Page(models.Model): document = models.ForeignKey(Document) page_number = models.IntegerField()
  • 41.
    class User(models.Model): created = models.DateTimeField(auto_now_add = True) username = models.TextField(max_length = 100) password_hash = models.CharField(max_length = 128, blank=True) class LineItemCategory(models.Model): order = models.IntegerField(default = 0) name = models.CharField(max_length = 32) class LineItem(models.Model): user = models.ForeignKey(User) page = models.ForeignKey(Page) type = models.CharField(max_length = 16, choices = ( ('claim', 'claim'), ('proof', 'proof'), ), db_index = True) date = models.DateField(null = True, blank = True) amount = models.DecimalField(max_digits=20, decimal_places=2) description = models.CharField(max_length = 255, blank = True) created = models.DateTimeField(auto_now_add = True, db_index = True) categories = models.ManyToManyField(LineItemCategory, blank=True)
  • 42.
    class Vote(models.Model): user = models.ForeignKey(User, related_name = 'votes') page = models.ForeignKey(Page, related_name = 'votes') obsolete = models.BooleanField(default = False) vote_type = models.CharField(max_length = 32, blank = True) ip_address = models.CharField(max_length = 32) created = models.DateTimeField(auto_now_add = True) class TypeVote(Vote): type = models.CharField(max_length = 10, choices = ( ('claim', 'Claim'), ('proof', 'Proof'), ('blank', 'Blank'), ('other', 'Other') )) class InterestingVote(Vote): status = models.CharField(max_length = 10, choices = ( ('no', 'Not interesting'), ('yes', 'Interesting'), ('known', 'Interesting but known'), ('very', 'Investigate this!'), ))
  • 43.
  • 45.
  • 47.
    page_filters = ( # Maps name of filter to dictionary of kwargs to doc.pages.filter() ('reviewed', { 'votes__isnull': False }), ('unreviewed', { 'votes__isnull': True }), ('with line items', { 'line_items__isnull': False }), ('interesting', { 'votes__interestingvote__status': 'yes' }), ('interesting but known', { 'votes__interestingvote__status': 'known' ... ) page_filters_lookup = dict(page_filters)
  • 48.
    pages = doc.pages.all() if page_filter: kwargs = page_filters_lookup.get(page_filter) if kwargs is None: raise Http404, 'Invalid page filter: %s' % page_filter pages = pages.filter(**kwargs).distinct() # Build the filters filters = [] for name, kwargs in page_filters: filters.append({ 'name': name, 'count': doc.pages.filter(**kwargs).distinct().count(), })
  • 49.
  • 50.
  • 51.
  • 55.
    def get_mp_pages(): "Returns list of (mp-name, mp-page-url) tuples" soup = Soup(urllib.urlopen(INDEX_URL)) mp_links = [] for link in soup.findAll('a'): if link.get('title', '').endswith("'s allowances"): mp_links.append( (link['title'].replace("'s allowances", ''), link['href']) ) return mp_links
  • 56.
    def get_pdfs(mp_url): "Returns list of (description, years, pdf-url, size) tuples" soup = Soup(urllib.urlopen(mp_url)) pdfs = [] trs = soup.findAll('tr')[1:] # Skip the first, it's the table header for tr in trs: name_td, year_td, pdf_td = tr.findAll('td') name = name_td.string year = year_td.string pdf_url = pdf_td.find('a')['href'] size = pdf_td.find('a').contents[-1].replace('(', '').replace(')', '') pdfs.append( (name, year, pdf_url, size) ) return pdfs
  • 60.
  • 61.
    Photoshop + AppleScript vs. Java + IntelliJ
  • 62.
    Images on our docroot (S3 upload was taking too long)
  • 63.
  • 64.
  • 66.
    Crash #1: more Apache children than MySQL connections
  • 69.
    unreviewed_count = Page.objects.filter( votes__isnull = True ).distinct().count()
  • 70.
    SELECT COUNT(DISTINCT `expenses_page`.`id`) FROM `expenses_page` LEFT OUTER JOIN `expenses_vote` ON ( `expenses_page`.`id` = `expenses_vote`.`page_id` ) WHERE `expenses_vote`.`id` IS NULL
  • 71.
    unreviewed_count = cache.get('homepage:unreviewed_count') if unreviewed_count is None: unreviewed_count = Page.objects.filter( votes__isnull = True ).distinct().count() cache.set('homepage: unreviewed_count', unreviewed_count, 60)
  • 72.
    • With 70,000 pages and a LOT of votes... • DB takes up 135% of CPU • Cache the count in memcached... • DB drops to 35% of CPU
  • 73.
    unreviewed_count = Page.objects.filter( votes__isnull = True ).distinct().count() reviewed_count = Page.objects.filter( votes__isnull = False ).distinct().count()
  • 74.
    unreviewed_count = Page.objects.filter( is_reviewed = False ).count()
  • 75.
    Migrating to InnoDB on a separate server
  • 76.
    ssh mps-live "mysqldump mp_expenses" | sed 's/ENGINE=MyISAM/ENGINE=InnoDB/g' | sed 's/CHARSET=latin1/CHARSET=utf8/g' | ssh mysql-big "mysql -u root mp_expenses"
  • 77.
  • 78.
    def next_global(request): # Next unreviewed page from the whole site all_unreviewed_pages = Page.objects.filter( is_reviewed = False ).order_by('?') if all_unreviewed_pages: return Redirect( all_unreviewed_pages[0].get_absolute_url() ) else: return HttpResponse( 'All pages have been reviewed!' )
  • 79.
    import random def next_global_from_cache(request): page_ids = cache.get('unreviewed_page_ids') if page_ids: return Redirect( '/page/%s/' % random.choice(page_ids) ) else: return next_global(request)
  • 80.
    from django.core.management.base import BaseCommand from mp_expenses.expenses.models import Page from django.core.cache import cache class Command(BaseCommand): help = """ populate unreviewed_page_ids in memcached """ requires_model_validation = True can_import_settings = True def handle(self, *args, **options): ids = list(Page.objects.exclude( is_reviewed = True ).values_list('pk', flat=True)[:1000]) cache.set('unreviewed_page_ids', ids)
  • 81.
  • 83.
    Final thoughts • High score tables help • MP photographs really help • Keeping up the interest is hard • Next step: start releasing the data