Forums

My flask app's scraper doesn't seem to work

First, I'll describe what I'm doing. I also want to mention that this does work on my local machine when I run it on localhost:5000.

I'm using pyppeteer to scrape some info from a website. Once I have that info, I do some calculations and display the results. For those interested, I'm building an EVE Online missile damage calculator that shows which missile types to use based on a person's zKillboard fitting data ;)

Here is my code :

import asyncio
from pyppeteer import launch
from bs4 import BeautifulSoup

async def scrape_website(url):
    """Scrape ship defence stats for the fit shown on a zKillboard kill page.

    Opens ``url`` in headless Chrome, follows the "Eve Workbench" link found
    inside the ``div#eveworkbench`` element, and reads the numeric stats from
    the ``span[data-extended-detail=...]`` elements of the linked page.

    Args:
        url: Address of the zKillboard page to start from.

    Returns:
        A dict of floats keyed by stat name (recharge/repair rates, EHP per
        layer, and the per-damage-type values for shield/armor/hull scaled by
        0.01), plus ``'eveworkbench_url'`` holding the followed link.
        ``None`` if the Eve Workbench div or link is not present on the page.

    Raises:
        AttributeError: If an expected stat span is missing from the page.
        ValueError: If a stat span does not contain a number.
    """
    # (result key, data-extended-detail attribute) for values stored as-is.
    plain_fields = (
        ('passive_shield_recharge_rate', 'shieldPassiveRechargeRateEHP'),
        ('shield_boost_rate', 'shieldActiveRechargeRateEHP'),
        ('armor_rep_rate', 'armorRepairRateEHP'),
        ('shieldEHP', 'shieldEHP'),
        ('armorEHP', 'armorEHP'),
        ('hullEHP', 'hullEHP'),
    )
    # Values multiplied by 0.01 before being stored; the result key equals
    # the attribute name (shieldEM, shieldTH, ..., hullEX).
    scaled_fields = tuple(
        layer + damage_type
        for layer in ('shield', 'armor', 'hull')
        for damage_type in ('EM', 'TH', 'KI', 'EX')
    )

    def read_stat(soup, detail):
        # Text of the span carrying the given data-extended-detail value.
        selector = f'span[data-extended-detail="{detail}"]'
        return float(soup.select_one(selector).text)

    browser = await launch(handleSIGINT=False, handleSIGTERM=False, handleSIGHUP=False)
    try:
        page_zkill = await browser.newPage()
        await page_zkill.goto(url)  # use the passed url

        zkill_html = await page_zkill.evaluate('''() => {
            return document.body.innerHTML;
        }''')
        zkill_soup = BeautifulSoup(zkill_html, 'html.parser')

        # Locate the link to the Eve Workbench page.
        eveworkbench_div = zkill_soup.find('div', id='eveworkbench')
        if eveworkbench_div is None:
            print("Eve Workbench div not found")
            return None

        a_tag = eveworkbench_div.find('a', string='Eve Workbench')
        if a_tag is None:
            print("Eve Workbench link not found")
            return None

        eveworkbench_url = a_tag.get('href')

        page = await browser.newPage()
        await page.goto(eveworkbench_url)
        # NOTE(review): presumably gives the page's scripts time to fill in
        # the stat spans before the DOM is read -- confirm.
        await asyncio.sleep(1.5)

        workbench_html = await page.evaluate('''() => {
            return document.body.innerHTML;
        }''')
        workbench_soup = BeautifulSoup(workbench_html, 'html.parser')

        results = {}
        for key, detail in plain_fields:
            results[key] = read_stat(workbench_soup, detail)
        for detail in scaled_fields:
            results[detail] = read_stat(workbench_soup, detail) * 0.01
        results['eveworkbench_url'] = eveworkbench_url

        return results  # return the results
    finally:
        # Always shut Chrome down, including on the early returns above and
        # on any exception, so no browser process is left behind.
        await browser.close()

And in my Flask app script I'm using the scrape_website function like this:

from flask import Flask, render_template, request
from concurrent.futures import ThreadPoolExecutor
import asyncio
from scraper import scrape_website  # import the scrape_website function
from scraper import calculate_best_damage

# Flask application object; the view below is registered on it via @app.route.
app = Flask(__name__)
# Thread pool (at most 5 workers) that the view submits the scraping job to.
executor = ThreadPoolExecutor(max_workers=5)  # create a ThreadPoolExecutor

def run_asyncio_task(task):
    """Run an awaitable to completion on a fresh event loop and return its result.

    A new loop is created for every call and installed as the calling
    thread's current loop while ``task`` runs, so this can be used from a
    worker thread that has no loop of its own.

    Args:
        task: Coroutine object (or other awaitable accepted by
            ``loop.run_until_complete``) to execute.

    Returns:
        Whatever ``task`` returns.

    Raises:
        Any exception raised by ``task`` is propagated to the caller.
    """
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        return loop.run_until_complete(task)
    finally:
        # Close the loop so its selector and file descriptors are released;
        # otherwise every call leaks one event loop.
        try:
            loop.run_until_complete(loop.shutdown_asyncgens())
        finally:
            asyncio.set_event_loop(None)
            loop.close()

@app.route('/', methods=['GET', 'POST'])
def index():
    """Render the calculator form and, on POST, the scraped stats and results.

    GET renders ``index.html`` with an empty URL and the default damage
    profile. POST reads the zKillboard URL and the damage profile from the
    form, scrapes the fit via ``scrape_website`` on the thread pool, passes
    the scraped stats to ``calculate_best_damage`` and renders the outcome.

    Form fields read: ``url_zkill`` (required), ``em_dmg``, ``thermal_dmg``,
    ``kinetic_dmg``, ``explosive_dmg``, ``rate_of_fire``,
    ``show_workbench_data``, ``show_damage_data``.

    Returns:
        The rendered ``index.html`` template.
    """
    hookbill_base_damage = 561
    hookbill_kinetic_damage = 631
    hookbill_rate_of_fire = 1.8

    # Default damage profile, used for GET and as fallback for bad form input.
    damage = {
        'em': hookbill_base_damage,
        'thermal': hookbill_base_damage,
        'kinetic': hookbill_kinetic_damage,
        'explosive': hookbill_base_damage,
        'rof': hookbill_rate_of_fire,  # rate of fire
    }

    if request.method != 'POST':
        return render_template('index.html', url='', damage=damage)

    url = request.form['url_zkill']

    # damage key -> form field name. A missing or non-numeric field keeps the
    # default instead of raising and turning the request into a 500.
    form_fields = {
        'em': 'em_dmg',
        'thermal': 'thermal_dmg',
        'kinetic': 'kinetic_dmg',
        'explosive': 'explosive_dmg',
        'rof': 'rate_of_fire',
    }
    for key, field in form_fields.items():
        damage[key] = request.form.get(field, default=damage[key], type=float)

    if len(url) <= 1:
        # No usable URL submitted: show the form again with an example URL.
        return render_template('index.html', url="https://zkillboard.com/kill/109853948/", damage=damage)

    # NOTE(review): this runs the scraper on a worker thread; some hosting
    # environments do not support threads in web app code -- verify before
    # deploying.
    future = executor.submit(run_asyncio_task, scrape_website(url))  # submit the task to the executor
    results = future.result()  # wait for the task to complete and get the result

    if results is None:
        # The scraper could not find the Eve Workbench link on the page;
        # re-render the form rather than calculating on missing data.
        return render_template('index.html', url=url, damage=damage)

    calculation_result = calculate_best_damage(results, damage)
    show_workbench_data = request.form.get('show_workbench_data')  # get the value of the checkbox
    show_damage_data = request.form.get('show_damage_data')
    return render_template(
        'index.html',
        results=results,
        calculation_results=calculation_result,
        damage=damage,
        show_workbench_data=show_workbench_data,
        show_damage_data=show_damage_data,
        url=url,
    )  # pass the value of the checkbox to the template

# Start Flask's built-in server only when this file is executed directly.
if __name__ == "__main__":
    # NOTE: debug=True enables Flask's debug mode; meant for local development only.
    app.run(debug=True)

The error log says that the browser closed unexpectedly. Does anyone have any idea what I'm doing wrong, or what is happening that makes it close? Is it PythonAnywhere that is closing the browser?

Here is the error from the log file:

2023-06-24 19:56:28,993: terminate chrome process...
2023-06-24 19:57:33,977: Exception on / [POST]
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/site-packages/flask/app.py", line 2077, in wsgi_app
    response = self.full_dispatch_request()
  File "/usr/local/lib/python3.10/site-packages/flask/app.py", line 1525, in full_dispatch_request
    rv = self.handle_user_exception(e)
  File "/usr/local/lib/python3.10/site-packages/flask/app.py", line 1523, in full_dispatch_request
    rv = self.dispatch_request()
  File "/usr/local/lib/python3.10/site-packages/flask/app.py", line 1509, in dispatch_request
    return self.ensure_sync(self.view_functions[rule.endpoint])(**req.view_args)
  File "/home/Toolism/./app.py", line 44, in index
    results = future.result()  # wait for the task to complete and get the result
  File "/usr/local/lib/python3.10/concurrent/futures/_base.py", line 446, in result
    return self.__get_result()
  File "/usr/local/lib/python3.10/concurrent/futures/_base.py", line 391, in __get_result
    raise self._exception
  File "/usr/local/lib/python3.10/concurrent/futures/thread.py", line 58, in run
    result = self.fn(*self.args, **self.kwargs)
  File "/home/Toolism/./app.py", line 13, in run_asyncio_task
    return loop.run_until_complete(task)
  File "/usr/local/lib/python3.10/asyncio/base_events.py", line 646, in run_until_complete
    return future.result()
  File "/home/Toolism/./scraper.py", line 6, in scrape_website
    browser = await launch(handleSIGINT=False, handleSIGTERM=False, handleSIGHUP=False)
  File "/home/Toolism/.local/lib/python3.10/site-packages/pyppeteer/launcher.py", line 307, in launch
    return await Launcher(options, **kwargs).launch()
  File "/home/Toolism/.local/lib/python3.10/site-packages/pyppeteer/launcher.py", line 168, in launch
    self.browserWSEndpoint = get_ws_endpoint(self.url)
  File "/home/Toolism/.local/lib/python3.10/site-packages/pyppeteer/launcher.py", line 227, in get_ws_endpoint
    raise BrowserError('Browser closed unexpectedly:\n')
pyppeteer.errors.BrowserError: Browser closed unexpectedly:
**NO MATCH**

We don't support multithreading in website code, so I suspect that's the underlying cause of this issue. Check out this help page on doing async work in websites for some suggestions of alternative ways to do that kind of thing.

Thank you very much! :) I am a game developer who has no idea how to write web-based stuff; I wrote all this with some heavy tutorial reading and some ChatGPT help. It didn't cross my mind not to do it the way the tutorials had done it. Thank you very much. I now have much more reading to do, hehe, but at least this gives me a way to move forward :)

Glad to hear that you are moving forward!