parser.py 6.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162
  1. #!/usr/bin/env python -OO
  2. # -*- coding: utf-8 -*-
  3. from __future__ import with_statement
  4. from bs4 import BeautifulSoup
  5. from glob import glob
  6. import argparse
  7. import os
  8. import re
  9. import sqlite3
  10. import sys
  11. def main(args):
  12. """Loop thru all the games and parse them."""
  13. if not os.path.isdir(args.dir):
  14. print ("The specified folder is not a directory.")
  15. sys.exit(1)
  16. NUMBER_OF_FILES = len(os.listdir(args.dir))
  17. if args.num_of_files:
  18. NUMBER_OF_FILES = args.num_of_files
  19. print ("Parsing", NUMBER_OF_FILES, "files")
  20. sql = None
  21. if not args.stdout:
  22. sql = sqlite3.connect(args.database)
  23. sql.execute("""PRAGMA foreign_keys = ON;""")
  24. sql.execute("""CREATE TABLE airdates(
  25. game INTEGER PRIMARY KEY,
  26. airdate TEXT
  27. );""")
  28. sql.execute("""CREATE TABLE documents(
  29. id INTEGER PRIMARY KEY AUTOINCREMENT,
  30. clue TEXT,
  31. answer TEXT
  32. );""")
  33. sql.execute("""CREATE TABLE categories(
  34. id INTEGER PRIMARY KEY AUTOINCREMENT,
  35. category TEXT UNIQUE
  36. );""")
  37. sql.execute("""CREATE TABLE clues(
  38. id INTEGER PRIMARY KEY AUTOINCREMENT,
  39. game INTEGER,
  40. round INTEGER,
  41. value INTEGER,
  42. FOREIGN KEY(id) REFERENCES documents(id),
  43. FOREIGN KEY(game) REFERENCES airdates(game)
  44. );""")
  45. sql.execute("""CREATE TABLE classifications(
  46. clue_id INTEGER,
  47. category_id INTEGER,
  48. FOREIGN KEY(clue_id) REFERENCES clues(id),
  49. FOREIGN KEY(category_id) REFERENCES categories(id)
  50. );""")
  51. for i, file_name in enumerate(glob(os.path.join(args.dir, "*.html")), 1):
  52. with open(os.path.abspath(file_name)) as f:
  53. parse_game(f, sql, i)
  54. if not args.stdout:
  55. sql.commit()
  56. print ("All done")
  57. def parse_game(f, sql, gid):
  58. """Parses an entire Jeopardy! game and extract individual clues."""
  59. bsoup = BeautifulSoup(f, "lxml")
  60. # The title is in the format: `J! Archive - Show #XXXX, aired 2004-09-16`,
  61. # where the last part is all that is required
  62. airdate = bsoup.title.get_text().split()[-1]
  63. if not parse_round(bsoup, sql, 1, gid, airdate) or not parse_round(bsoup, sql, 2, gid, airdate):
  64. # One of the rounds does not exist
  65. pass
  66. # The final Jeopardy! round
  67. r = bsoup.find("table", class_="final_round")
  68. if not r:
  69. # This game does not have a final clue
  70. return
  71. category = r.find("td", class_="category_name").get_text()
  72. text = r.find("td", class_="clue_text").get_text()
  73. answer = BeautifulSoup(r.find("div", onmouseover=True).get("onmouseover"), "lxml")
  74. answer = answer.find("em").get_text()
  75. # False indicates no preset value for a clue
  76. insert(sql, [gid, airdate, 3, category, False, text, answer])
  77. def parse_round(bsoup, sql, rnd, gid, airdate):
  78. """Parses and inserts the list of clues from a whole round."""
  79. round_id = "jeopardy_round" if rnd == 1 else "double_jeopardy_round"
  80. r = bsoup.find(id=round_id)
  81. # The game may not have all the rounds
  82. if not r:
  83. return False
  84. # The list of categories for this round
  85. categories = [c.get_text() for c in r.find_all("td", class_="category_name")]
  86. # The x_coord determines which category a clue is in
  87. # because the categories come before the clues, we will
  88. # have to match them up with the clues later on.
  89. x = 0
  90. for a in r.find_all("td", class_="clue"):
  91. is_missing = True if not a.get_text().strip() else False
  92. if not is_missing:
  93. value = a.find("td", class_=re.compile("clue_value")).get_text().lstrip("D: $")
  94. text = a.find("td", class_="clue_text").get_text()
  95. answer = BeautifulSoup(a.find("div", onmouseover=True).get("onmouseover"), "lxml")
  96. answer = answer.find("em", class_="correct_response").get_text()
  97. insert(sql, [gid, airdate, rnd, categories[x], value, text, answer])
  98. # Always update x, even if we skip
  99. # a clue, as this keeps things in order. there
  100. # are 6 categories, so once we reach the end,
  101. # loop back to the beginning category.
  102. #
  103. # Using modulus is slower, e.g.:
  104. #
  105. # x += 1
  106. # x %= 6
  107. #
  108. x = 0 if x == 5 else x + 1
  109. return True
  110. def insert(sql, clue):
  111. """Inserts the given clue into the database."""
  112. # Clue is [game, airdate, round, category, value, clue, answer]
  113. # Note that at this point, clue[4] is False if round is 3
  114. if "\\\'" in clue[6]:
  115. clue[6] = clue[6].replace("\\\'", "'")
  116. if "\\\"" in clue[6]:
  117. clue[6] = clue[6].replace("\\\"", "\"")
  118. if not sql:
  119. print (clue)
  120. return
  121. sql.execute(
  122. "INSERT OR IGNORE INTO airdates VALUES(?, ?);",
  123. (clue[0], clue[1], )
  124. )
  125. sql.execute("INSERT OR IGNORE INTO categories(category) VALUES(?);", (clue[3], ))
  126. category_id = sql.execute("SELECT id FROM categories WHERE category=?;", (clue[3], )).fetchone()[0]
  127. clue_id = sql.execute("INSERT INTO documents(clue, answer) VALUES(?, ?);", (clue[5], clue[6], )).lastrowid
  128. sql.execute("INSERT INTO clues(game, round, value) VALUES(?, ?, ?);", (clue[0], clue[2], clue[4], ))
  129. sql.execute("INSERT INTO classifications VALUES(?, ?)", (clue_id, category_id, ))
  130. if __name__ == "__main__":
  131. parser = argparse.ArgumentParser(
  132. description="Parse games from the J! Archive website.", add_help=False,
  133. usage="%(prog)s [options]")
  134. parser.add_argument("-d", "--dir", dest="dir", metavar="<folder>",
  135. help="the directory containing the game files",
  136. default="j-archive")
  137. parser.add_argument("-n", "--number-of-files", dest="num_of_files",
  138. metavar="<number>", help="the number of files to parse",
  139. type=int)
  140. parser.add_argument("-f", "--filename", dest="database",
  141. metavar="<filename>",
  142. help="the filename for the SQLite database",
  143. default="clues.db")
  144. parser.add_argument("--stdout",
  145. help="output the clues to stdout and not a database",
  146. action="store_true")
  147. parser.add_argument("--help", action="help",
  148. help="show this help message and exit")
  149. parser.add_argument("--version", action="version", version="2014.09.14")
  150. main(parser.parse_args())