avm99963 | 411e36a | 2020-09-27 23:32:48 +0200 | [diff] [blame] | 1 | import requests |
| 2 | from bs4 import BeautifulSoup |
| 3 | import pyodbc |
| 4 | import re |
| 5 | from datetime import datetime, timedelta |
| 6 | import pytz |
| 7 | |
class TableParser:
    """Scrape the day view of a room-booking page and optionally store it.

    The page is fetched from ``baseUrl`` with ``year``/``month``/``day``/
    ``area`` query parameters. The element with id ``day_main`` is expected
    to be a table with one row per half-hour slot and one column per room.
    """

    # Timezone in which the times shown on the page are interpreted.
    TIMEZONE = 'Europe/Madrid'
    # A cell whose first CSS class is one of these holds no booking.
    EMPTY_CELL_CLASSES = ['row_labels', 'even_row', 'odd_row']
    # Seconds to wait for the HTTP response before giving up.
    REQUEST_TIMEOUT = 30
    # Extracts the room identifier from a column header such as
    # "Aula 005 (30)".
    ROOM_PATTERN = re.compile(r"Aula (\S+) ?\(\d*\)", re.IGNORECASE)

    def __init__(self, baseUrl):
        """Remember the URL of the day-view page to scrape."""
        self.baseUrl = baseUrl

    @staticmethod
    def _time_slots():
        """Return the half-hour row labels from "08:00" to "21:30"."""
        slots = []
        for hour in range(8, 22):
            for minute in (0, 30):
                slots.append("%02d:%02d" % (hour, minute))
        return slots

    @classmethod
    def _room_name(cls, header_text):
        """Return the room id from a column header.

        Falls back to the unmodified header text when it does not follow
        the "Aula <id> (<capacity>)" shape, instead of crashing.
        """
        match = cls.ROOM_PATTERN.match(header_text)
        return match.group(1) if match else header_text

    @staticmethod
    def _store(cursor, assignatura, aula, begins, ends):
        """Insert one class into the ``classes`` table unless it is there."""
        cursor.execute("SELECT id FROM classes WHERE calendar_name = ? AND room = ? AND begins = ? AND ends = ?",
                       assignatura, aula, begins, ends)
        row = cursor.fetchone()
        if row:
            print("[WARNING] Ja estava a la DB (id " + str(row.id) + ")")
        else:
            cursor.execute("INSERT INTO classes (calendar_name, room, begins, ends) VALUES (?, ?, ?, ?)",
                           assignatura, aula, begins, ends)

    def parse(self, year, month, day, area, db = None):
        """Fetch one day of the timetable, print it and optionally store it.

        Args:
            year, month, day: Date to fetch, as integers (they are also
                passed to ``datetime``).
            area: Value sent as the ``area`` query parameter.
            db: Optional DB-API connection (the file imports ``pyodbc``).
                When given, every booking not yet present in ``classes``
                is inserted and the transaction is committed at the end.

        Raises:
            ValueError: If the page has no element with id ``day_main``.
        """
        params = {
            'year': year,
            'month': month,
            'day': day,
            'area': area
        }
        page = requests.get(self.baseUrl, params=params,
                            timeout=self.REQUEST_TIMEOUT)
        soup = BeautifulSoup(page.content, 'html.parser')
        table = soup.find(id="day_main")
        if table is None:
            raise ValueError("No element with id 'day_main' found at "
                             + str(page.url))

        # Loop invariants: look these up once, not once per cell.
        headers = table.find_all("th")
        tz = pytz.timezone(self.TIMEZONE)
        cursor = db.cursor() if db is not None else None

        try:
            for hora in self._time_slots():
                label = table.find(text=hora)
                if label is None:
                    # The page has no row for this slot; nothing to read.
                    continue

                td_hora = label.findNext('td')
                column = 1

                # Walk the cells of the row until the next cell that shows
                # the same hour label again, or until the document ends.
                # NOTE(review): `column` counts the cells present in this
                # row; if a booking from an earlier row spans this one, the
                # header lookup may be shifted - confirm against the real
                # markup.
                while td_hora is not None and hora not in td_hora.get_text():
                    # Tag.get() returns None for cells without a class
                    # attribute, where td_hora['class'] raised KeyError.
                    classes = td_hora.get('class')
                    if classes and classes[0] not in self.EMPTY_CELL_CLASSES:
                        assignaturaRaw = td_hora.get_text().strip()
                        assignatura = assignaturaRaw.lower()
                        aulaRaw = headers[column].get_text().strip()
                        aula = self._room_name(aulaRaw)
                        # Each spanned row is one 30-minute slot; a cell
                        # without rowspan covers a single slot.
                        durada = int(td_hora.get("rowspan", 1)) * 30

                        timeSplit = hora.split(':')
                        begins = tz.localize(datetime(
                            year, month, day,
                            int(timeSplit[0]), int(timeSplit[1])))
                        ends = begins + timedelta(minutes=durada)

                        print(("Afegint " if db is not None else "") + assignaturaRaw
                              + ", " + hora
                              + ", " + str(durada) + "mins"
                              + ", " + aula)

                        if cursor is not None:
                            self._store(cursor, assignatura, aula, begins, ends)

                    td_hora = td_hora.findNext('td')
                    column = column + 1
        finally:
            if cursor is not None:
                cursor.close()

        if db is not None:
            db.commit()