avm99963 | 411e36a | 2020-09-27 23:32:48 +0200 | [diff] [blame] | 1 | import requests |
| 2 | from bs4 import BeautifulSoup |
| 3 | import pyodbc |
| 4 | import re |
| 5 | from datetime import datetime, timedelta |
| 6 | import pytz |
| 7 | |
class TableParser:
    """Scrape the bookings of one day from an HTML timetable page.

    The page is fetched from ``baseUrl`` and the element with id
    ``day_main`` is read as a grid: one row per half-hour slot and one
    ``<th>`` header per room. Every booking found is printed and, when a
    database connection is supplied, stored in the ``classes`` table.
    """

    # Name of the timezone in which the slot labels of the page are read.
    TIMEZONE = 'Europe/Madrid'
    # A cell whose first CSS class is one of these is not a booking.
    EMPTY_CELL_CLASSES = ['row_labels', 'even_row', 'odd_row']

    def __init__(self, baseUrl):
        """Store the URL that :meth:`parse` requests."""
        self.baseUrl = baseUrl

    def parse(self, year, month, day, area, db=None):
        """Fetch the timetable of one day and report (and store) its bookings.

        Args:
            year, month, day: Date to fetch. They are sent as query
                parameters and also passed to ``datetime(...)``, so they
                must be integers.
            area: Value sent as the ``area`` query parameter.
            db: Optional database connection. It must offer ``cursor()``
                and ``commit()``, and its cursors must accept
                ``execute(sql, *params)`` with ``?`` placeholders (the file
                imports ``pyodbc``). When given, each booking that is not
                already in ``classes`` is inserted, and a single commit is
                issued after the whole table has been read. When ``None``
                the bookings are only printed.

        Raises:
            AttributeError: If the page has no ``day_main`` element, a slot
                label is missing, or a room header does not look like
                ``Aula <name> (<number>)``.
        """
        params = {
            'year': year,
            'month': month,
            'day': day,
            'area': area,
        }
        page = requests.get(self.baseUrl, params=params)
        soup = BeautifulSoup(page.content, 'html.parser')
        table = soup.find(id="day_main")

        # Slot labels "08:00", "08:30", ..., "21:30".
        hores = [
            "{:02d}:{}".format(h, minutes)
            for h in range(8, 22)
            for minutes in ("00", "30")
        ]

        p = re.compile(r"Aula (\S+) ?\(\d*\)", re.IGNORECASE)

        # Loop invariants: header cells, timezone object and the db switch.
        headers = table.find_all("th")
        tz = pytz.timezone(self.TIMEZONE)
        store = db is not None

        for hora in hores:
            hour, minute = (int(part) for part in hora.split(':'))

            # Start at the first <td> after the slot label and walk forward
            # until a cell containing the same label is reached.
            td_hora = table.find(text=hora).find_next('td')
            # Header 0 is skipped; presumably it is the time-label column.
            # NOTE(review): `column` counts the <td>s met in this row. If a
            # booking from an earlier row spans into this one (rowspan > 1)
            # and the page omits the covered cell, later cells of this row
            # would be matched with the wrong header -- confirm against the
            # real markup.
            column = 1

            while hora not in td_hora.get_text():
                classes = td_hora.get('class')
                if classes and classes[0] not in self.EMPTY_CELL_CLASSES:
                    assignatura = td_hora.get_text().strip()
                    aulaRaw = headers[column].get_text().strip()
                    aula = p.match(aulaRaw).group(1)
                    # Each spanned row is one 30-minute slot; a cell with no
                    # rowspan attribute covers a single row (HTML default).
                    durada = int(td_hora.get("rowspan", 1)) * 30

                    beginsDateTime = tz.localize(
                        datetime(year, month, day, hour, minute))
                    begins = int(beginsDateTime.timestamp())
                    endsDateTime = beginsDateTime + timedelta(minutes=durada)
                    ends = int(endsDateTime.timestamp())

                    print(("Afegint " if store else "") + assignatura
                          + ", " + hora
                          + ", " + str(durada) + "mins"
                          + ", " + aula)

                    if store:
                        # Insert only when no identical row exists yet.
                        cursor1 = db.cursor()
                        cursor1.execute("SELECT id FROM classes WHERE calendar_name = ? AND room = ? AND begins = ? AND ends = ?",
                                        assignatura, aula, begins, ends)
                        row = cursor1.fetchone()
                        if row:
                            print("[WARNING] Ja estava a la DB (id " + str(row.id) + ")")
                        else:
                            cursor2 = db.cursor()
                            cursor2.execute("INSERT INTO classes (calendar_name, room, begins, ends) VALUES (?, ?, ?, ?)",
                                            assignatura, aula, begins, ends)

                td_hora = td_hora.find_next('td')
                column = column + 1

        if store:
            db.commit()