Afegida integració amb MySQL per table-parser
S'ha encapsulat el codi que parseja les classes d'un dia en una classe
de Python (al fitxer TableParser.py), i s'ha creat el fitxer
cron-parse-tables.py que quan es crida utilitza la classe TableParser
per afegir a la base de dades les classes del dia següent.
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..fd1ac68
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+config.ini
+__pycache__/
diff --git a/TableParser.py b/TableParser.py
new file mode 100644
index 0000000..76eefd5
--- /dev/null
+++ b/TableParser.py
@@ -0,0 +1,112 @@
+import requests
+from bs4 import BeautifulSoup
+import pyodbc
+import re
+from datetime import datetime, timedelta
+import pytz
+
+class TableParser:
+    """Scrape one day of room bookings from a web page into a database.
+
+    The page at ``baseUrl`` must contain an element with id ``day_main``
+    holding the bookings table: ``<th>`` cells name the rooms and each row
+    is labelled with its start time ("08:00", "08:30", ...).
+    """
+
+    # Time zone in which the scraped wall-clock times are interpreted.
+    TIMEZONE = 'Europe/Madrid'
+    # First CSS class of cells that hold no booking (labels and free slots).
+    EMPTY_CELL_CLASSES = ['row_labels', 'even_row', 'odd_row']
+    # Minutes covered by one table row; a booking spans ``rowspan`` rows.
+    SLOT_MINUTES = 30
+    # Seconds to wait for the web server before giving up.
+    REQUEST_TIMEOUT = 30
+
+    def __init__(self, baseUrl):
+        """Store the URL of the day-view page to query."""
+        self.baseUrl = baseUrl
+
+    def parse(self, year, month, day, area, db):
+        """Fetch the bookings of one day/area and insert the new ones.
+
+        Args:
+            year: Year of the day to fetch (int).
+            month: Month of the day to fetch (int).
+            day: Day of the month to fetch (int).
+            area: Value sent to the page as the ``area`` query parameter.
+            db: Open database connection offering ``cursor()`` and
+                ``commit()``; queries use ``?`` placeholders and target
+                the ``classes`` table.
+
+        A booking already stored with the same name, room, start and end
+        is reported and skipped. All inserts are committed once at the end.
+
+        Raises:
+            requests.RequestException: on timeout or HTTP error status.
+            AttributeError: if the table, a row label or a room header
+                does not have the expected shape.
+        """
+        page = requests.get(
+            self.baseUrl,
+            params={'year': year, 'month': month, 'day': day, 'area': area},
+            # Without a timeout a stalled server would hang the caller.
+            timeout=self.REQUEST_TIMEOUT,
+        )
+        # Stop on 4xx/5xx instead of trying to parse an error page.
+        page.raise_for_status()
+        soup = BeautifulSoup(page.content, 'html.parser')
+        table = soup.find(id="day_main")
+
+        # Row labels "08:00", "08:30", ..., "21:30".
+        hores = ["%02d:%s" % (h, m) for h in range(8, 22) for m in ("00", "30")]
+
+        # Room headers look like "Aula <name> (<digits>)"; keep only <name>.
+        p = re.compile(r"Aula (\S+) ?\(\d*\)", re.IGNORECASE)
+
+        # Loop invariants, computed once.
+        headers = table.find_all("th")
+        tz = pytz.timezone(self.TIMEZONE)
+        cursor = db.cursor()
+
+        for hora in hores:
+            td_hora = table.find(text=hora).find_next('td')
+            column = 1
+            hour, minute = (int(part) for part in hora.split(':'))
+
+            # Visit the following cells until one contains the row label.
+            # NOTE(review): column advances once per <td>; if a booking from
+            # an earlier row spans this one, its cell is absent here and the
+            # header index may be shifted -- confirm against a real page.
+            while hora not in td_hora.get_text():
+                # Read the class list defensively: a cell without a class
+                # attribute counts as empty instead of raising KeyError.
+                classes = td_hora.get('class') or []
+                if classes and classes[0] not in self.EMPTY_CELL_CLASSES:
+                    assignaturaRaw = td_hora.get_text().strip()
+                    assignatura = assignaturaRaw.lower()
+                    aulaRaw = headers[column].get_text().strip()
+                    aula = p.match(aulaRaw).group(1)
+                    # HTML treats a missing rowspan attribute as 1.
+                    durada = int(td_hora.get("rowspan", 1)) * self.SLOT_MINUTES
+
+                    begins = tz.localize(datetime(year, month, day, hour, minute))
+                    ends = begins + timedelta(minutes=durada)
+
+                    print("Afegint " + assignaturaRaw
+                        + ", " + hora
+                        + ", " + str(durada) + "mins"
+                        + ", " + aula)
+
+                    cursor.execute("SELECT id FROM classes WHERE calendar_name = ? AND room = ? AND begins = ? AND ends = ?",
+                        assignatura, aula, begins, ends)
+                    row = cursor.fetchone()
+                    if row:
+                        print("[WARNING] Ja estava a la DB (id " + str(row.id) + ")")
+                    else:
+                        cursor.execute("INSERT INTO classes (calendar_name, room, begins, ends) VALUES (?, ?, ?, ?)",
+                            assignatura, aula, begins, ends)
+
+                td_hora = td_hora.find_next('td')
+                column = column + 1
+
+        db.commit()
diff --git a/config.ini.default b/config.ini.default
new file mode 100644
index 0000000..59c6506
--- /dev/null
+++ b/config.ini.default
@@ -0,0 +1,5 @@
+[db]
+host = localhost
+database = covid_tracability
+user = user
+password = password
diff --git a/cron-parse-tables.py b/cron-parse-tables.py
new file mode 100644
index 0000000..364ecf4
--- /dev/null
+++ b/cron-parse-tables.py
@@ -0,0 +1,38 @@
+import pyodbc
+import configparser
+from TableParser import TableParser
+import datetime
+
+# Read the database settings from the [db] section of config.ini.
+config = configparser.ConfigParser()
+config.read('config.ini')
+
+db_host = config['db']['host']
+db_database = config['db']['database']
+db_user = config['db']['user']
+db_password = config['db']['password']
+
+connection_string = (
+    'DRIVER=MySQL ODBC 8.0 ANSI Driver;'
+    'SERVER=' + db_host + ';'
+    'DATABASE=' + db_database + ';'
+    'UID=' + db_user + ';'
+    'PWD=' + db_password + ';'
+    'charset=utf8mb4;'
+)
+
+db = pyodbc.connect(connection_string)
+try:
+    # Exchange text with the driver as UTF-8.
+    db.setdecoding(pyodbc.SQL_WCHAR, encoding='utf-8')
+    db.setencoding(encoding='utf-8')
+
+    # Store the classes scheduled for the day after the run date.
+    tomorrow = datetime.date.today() + datetime.timedelta(days=1)
+
+    parser = TableParser('https://fme-intranet.upc.edu/appsext/mrbs/web/day.php')
+    for area in [2, 6]:
+        parser.parse(tomorrow.year, tomorrow.month, tomorrow.day, area, db)
+finally:
+    # Release the connection even if a page fails to download or parse.
+    db.close()
diff --git a/table-parser.py b/table-parser.py
deleted file mode 100644
index 63f3200..0000000
--- a/table-parser.py
+++ /dev/null
@@ -1,36 +0,0 @@
-import requests
-from bs4 import BeautifulSoup
-
-url = "https://fme-intranet.upc.edu/appsext/mrbs/web/day.php?year=2020&month=9&day=17&area=2"
-page = requests.get(url)
-soup = BeautifulSoup(page.content, 'html.parser')
-table = soup.find(id="day_main")
-
-hores = []
-for h in range(8,22):
- newhour = ""
- if h < 10:
- newhour += "0"
- newhour += str(h);
-
- hores.append(newhour + ":00");
- hores.append(newhour + ":30");
-
-for hora in hores:
- td_hora = table.find(text=hora).findNext('td')
- column = 1
-
- while hora not in td_hora.get_text():
-
- if "CDATA" not in td_hora.get_text():
- assignatura = td_hora.get_text().strip()
- aula = table.find_all("th")[column].get_text().strip()
- durada = int(td_hora.get("rowspan"))*30
-
- print(assignatura
- + ", " + hora
- + ", " + str(durada) + "mins"
- + ", " + aula)
-
- td_hora = td_hora.findNext('td')
- column = column + 1