"""Small Flask app that scrapes a lunch-menu image, OCRs it and serves the text.

The menu page is fetched on every request; the OCR step is skipped when the
image bytes are unchanged since the last successful extraction.
"""
import hashlib
import io
from pathlib import Path
from typing import Tuple
from urllib.parse import urljoin

import pytesseract
import requests
from PIL import Image
from bs4 import BeautifulSoup
from flask import Flask, render_template, make_response

app = Flask(__name__)

url = 'https://bueze.de/unser-mittagstisch/'
image_hash_file = Path('.menu_image_hash')
menu_text = Path("./menu.txt")

# Seconds to wait for the remote site; without a timeout a stalled server
# would block the request handler indefinitely.
REQUEST_TIMEOUT = 10

# The OCR output contains German text, so the cache file is always read and
# written as UTF-8 instead of relying on the platform default encoding.
TEXT_ENCODING = 'utf-8'


def find_image_url():
    """
    Locate the menu image on the menu page.

    :return: absolute URL of the first ``<img>`` inside the first
        ``<figure class="wp-block-image">`` element
    :raises requests.RequestException: if the page cannot be fetched
    :raises ValueError: if the page contains no such image or it has no ``src``
    """
    with requests.get(url, timeout=REQUEST_TIMEOUT) as response:
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

    figure = soup.find('figure', class_='wp-block-image')
    image = figure.find('img') if figure is not None else None
    src = image.get('src') if image is not None else None
    if not src:
        raise ValueError(f'No menu image found on {url}')
    # Resolve relative or protocol-relative sources against the page URL;
    # absolute sources are returned unchanged.
    return urljoin(url, src)


def get_image_bytes(image_url):
    """
    Download the image.

    :param image_url: URL of the image to download
    :return: the decoded response body as bytes
    :raises requests.RequestException: if the download fails
    """
    with requests.get(image_url, timeout=REQUEST_TIMEOUT) as response:
        response.raise_for_status()
        # ``content`` applies any Content-Encoding (gzip/deflate) decoding,
        # which reading ``response.raw`` directly would skip.
        return response.content


def is_same_hash(img_bytes):
    """
    Check whether the given image matches the one processed last time.

    :param img_bytes: raw bytes of the freshly downloaded image
    :return: True if the stored MD5 digest equals the digest of ``img_bytes``,
        False otherwise (including when no digest has been stored yet)
    """
    try:
        previous_hash = image_hash_file.read_text().strip()
    except FileNotFoundError:
        return False
    return previous_hash == hashlib.md5(img_bytes).hexdigest()


def write_html(text):
    """
    Persist the extracted menu text to the ``menu_text`` file.

    :param text: the text to store
    """
    menu_text.write_text(text, encoding=TEXT_ENCODING)


def get_menu() -> Tuple[Path, str]:
    """
    Get the menu text and image URL.

    Downloads the current menu image and, if it differs from the previously
    processed one (or no text file exists yet), runs OCR and updates the
    cached text and hash files.

    :return: menu_text, image_url
    """
    image_url = find_image_url()
    image_bytes = get_image_bytes(image_url)

    # Check if the image has changed
    if is_same_hash(image_bytes) and menu_text.exists():
        print('No new image')
        return menu_text, image_url

    # The image has changed: extract the text first and only then record the
    # new hash, so that a failed OCR run is retried on the next request
    # instead of leaving a stale menu marked as up to date.
    with Image.open(io.BytesIO(image_bytes)) as image:
        text = pytesseract.image_to_string(image, lang='deu')
    print('New image found')
    write_html(text)
    image_hash_file.write_text(hashlib.md5(image_bytes).hexdigest())

    return menu_text, image_url


@app.route('/')
def root():
    """
    This view function returns the menu text and image as a web page.
    """
    text_path, image_url = get_menu()
    # Read the file once and derive both template variables from it.
    text = text_path.read_text(encoding=TEXT_ENCODING)
    html = render_template('base.html', lines=text.splitlines(), img=image_url, text=text)
    return make_response(html, 200)


@app.route('/plain')
def plain():
    """
    This view function returns the menu text as plain text.
    """
    text_path, _ = get_menu()
    response = make_response(text_path.read_text(encoding=TEXT_ENCODING), 200)
    response.mimetype = 'text/plain'
    return response