avm99963 | 411e36a | 2020-09-27 23:32:48 +0200 | [diff] [blame] | 1 | import requests |
| 2 | from bs4 import BeautifulSoup |
| 3 | import pyodbc |
| 4 | import re |
| 5 | from datetime import datetime, timedelta |
| 6 | import pytz |
| 7 | |
class TableParser:
    """Scrape one day of a room timetable page and optionally store it.

    The page is fetched from ``baseUrl`` and the element with id
    ``day_main`` is read as a grid of half-hour rows (08:00 to 21:30) by
    room columns. Every booked cell is printed and, when a database
    connection is supplied, inserted into the ``classes`` table unless an
    identical row already exists.
    """

    # Time zone used to turn the page's wall-clock times into timestamps.
    TIMEZONE = 'Europe/Madrid'
    # First CSS class of cells that do not hold a booking.
    EMPTY_CELL_CLASSES = ['row_labels', 'even_row', 'odd_row']

    def __init__(self, baseUrl):
        """Remember the URL the timetable is requested from."""
        self.baseUrl = baseUrl

    @staticmethod
    def _slot_labels():
        """Return the half-hour row labels, "08:00" through "21:30"."""
        labels = []
        for h in range(8, 22):
            hour = "{:02d}".format(h)
            labels.append(hour + ":00")
            labels.append(hour + ":30")
        return labels

    @staticmethod
    def _reserve_spanned_rows(implicitClasses, row, column, rowspan):
        """Mark ``column`` as occupied in the rows a spanning cell covers.

        A cell with ``rowspan`` rows starting at ``row`` also occupies the
        ``rowspan - 1`` rows below it; those rows have no ``<td>`` of their
        own for that column. Rows past the end of ``implicitClasses`` are
        ignored.
        """
        last = min(row + rowspan, len(implicitClasses))
        for target in range(row + 1, last):
            implicitClasses[target].append(column)

    def parse(self, year, month, day, area, db=None):
        """Fetch the timetable for one day and area and report its bookings.

        Args:
            year, month, day: Date to fetch; sent as query parameters and
                used to build the start time of every booking.
            area: Value of the ``area`` query parameter.
            db: Optional database connection. It must provide ``cursor()``
                and ``commit()``, and cursors must accept qmark parameters
                passed as positional arguments. When ``None`` the bookings
                are only printed.

        Returns:
            None.

        Raises:
            AttributeError: If the page has no ``day_main`` element, a slot
                label is missing from it, or a column header does not match
                the ``Aula <name> (<n>)`` pattern.
        """
        params = {
            'year': year,
            'month': month,
            'day': day,
            'area': area
        }
        page = requests.get(self.baseUrl, params=params)
        soup = BeautifulSoup(page.content, 'html.parser')
        table = soup.find(id="day_main")

        hores = self._slot_labels()
        # implicitClasses[r] lists the columns of row r that are covered by
        # a cell spanning down from an earlier row.
        implicitClasses = [[] for _ in hores]

        p = re.compile(r"Aula (\S+) ?\(\d*\)", re.IGNORECASE)

        # Loop-invariant lookups, done once instead of once per booking.
        headers = table.find_all("th")
        timezone = pytz.timezone(self.TIMEZONE)

        for horaActual, hora in enumerate(hores):
            td_hora = table.find(text=hora).findNext('td')
            column = 1

            # The loop relies on each row ending with a cell that repeats
            # the slot label; that cell stops the walk.
            while hora not in td_hora.get_text():
                # Skip columns taken by cells spanning from rows above.
                while column in implicitClasses[horaActual]:
                    column += 1

                # .get() avoids a KeyError on cells without a class.
                classes = td_hora.get('class')
                if classes and classes[0] not in self.EMPTY_CELL_CLASSES:
                    assignatura = td_hora.get_text().strip()
                    degree = classes[0]
                    aulaRaw = headers[column].get_text().strip()
                    aula = p.match(aulaRaw).group(1)
                    # A cell without rowspan covers exactly one slot.
                    files = int(td_hora.get("rowspan", 1))
                    durada = files * 30

                    timeSplit = hora.split(':')

                    beginsDateTime = datetime(year, month, day,
                                              int(timeSplit[0]),
                                              int(timeSplit[1]))
                    beginsDateTime = timezone.localize(beginsDateTime)
                    begins = int(beginsDateTime.timestamp())
                    endsDateTime = beginsDateTime + timedelta(minutes=durada)
                    ends = int(endsDateTime.timestamp())

                    print(("Afegint " if db is not None else "") + assignatura
                          + ", " + hora
                          + ", " + str(durada) + "mins"
                          + ", " + aula
                          + ", " + degree)

                    if db is not None:
                        cursor1 = db.cursor()
                        cursor1.execute("SELECT id FROM classes WHERE calendar_name = ? AND room = ? AND begins = ? AND ends = ? AND degree = ?",
                                        assignatura, aula, begins, ends, degree)
                        row = cursor1.fetchone()
                        if row:
                            print("[WARNING] Ja estava a la DB (id " + str(row.id) + ")")
                        else:
                            cursor2 = db.cursor()
                            cursor2.execute("INSERT INTO classes (calendar_name, room, begins, ends, degree) VALUES (?, ?, ?, ?, ?)",
                                            assignatura, aula, begins, ends, degree)

                    self._reserve_spanned_rows(implicitClasses, horaActual,
                                               column, files)

                td_hora = td_hora.findNext('td')
                column = column + 1

        if db is not None:
            db.commit()