I'm currently designing a Discord bot that scrapes a constantly updating web page for patches related to a PBE server. I have the bot running through Heroku successfully right now. The issue I'm running into is that I want to create an automated (timed loop) refresh that reloads the website I have requested. As it currently stands, it only loads one instance of the website, and if that website changes/updates, none of my content updates, because I'm still using the "old" request of the website.
Is there a way for me to bury code inside a function so that I can create a timed loop, or do I only need to create one around my website request? How would that look? Thanks!
from bs4 import BeautifulSoup
from urllib.request import urlopen
from discord.ext import commands
import discord

# what I want the commands to start with
bot = commands.Bot(command_prefix='!')

# instantiating discord client
token = "************************************"
client = discord.Client()

# begin the scraping of the passed-in web page
URL = "*********************************"
page = urlopen(URL)
soup = BeautifulSoup(page, 'html.parser')

# using soup to find all header tags with the news-title class
# and storing them in pbe_titles
pbe_titles = soup.find_all('h1', attrs={'class': 'news-title'})

linksAndTitles = []
counter = 0

# finding tags that start with 'a' as in a href and appending those titles/links
for tag in pbe_titles:
    for anchor in tag.find_all('a'):
        linksAndTitles.append(tag.text.strip())
        linksAndTitles.append(anchor['href'])

# counts the number of lines stored inside the linksAndTitles list
for i in linksAndTitles:
    counter = counter + 1
print(counter)

# separates the list by line so that it looks nice when printing
allPatches = '\n'.join(str(line) for line in linksAndTitles[:counter])

# stores the first two lines in the list, which are the current pbe patch title and link
currPatch = '\n'.join(str(line) for line in linksAndTitles[:2])

# command that lets a user type in exactly which patch they want to see information for, based on date
@bot.command(name='patch')
async def pbe_patch(ctx, *, arg):
    if any(item.startswith(arg) for item in linksAndTitles):
        await ctx.send(arg + " exists!")
    else:
        await ctx.send('The date you entered: ' + '"' + arg + '"' + ' does not have a patch associated with it or that patch expired.')

# command that displays the current, most up to date, patch
@bot.command(name='current')
async def current_patch(ctx):
    response = currPatch
    await ctx.send(response)

bot.run(token)
I've played around with while True: loops, but whenever I nest anything inside of them, I can't access the code in other places.
discord has a special decorator, tasks, for running some code periodically:
from discord.ext import tasks

@tasks.loop(seconds=5.0)
async def scrape():
    # ... your scraping code ...

# ... your commands ...

scrape.start()
bot.run(token)
and it will repeat the function scrape every 5 seconds.
Documentation: tasks
On Linux I would eventually use the standard service cron to run some script periodically. This script could scrape the data and save it to a file or database, and discord could read it from that file or database (a sketch of this follows below). But cron checks its tasks every 1 minute, so it can't run a task more often than that.
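As a minimal sketch of that cron approach (the script name scrape_to_file.py, the crontab entry, and the output path patches.json are all hypothetical; the scraping itself mirrors the bot code further below):

# scrape_to_file.py - standalone scraper that cron runs every minute,
# e.g. with a crontab entry like:
#   * * * * * /usr/bin/python3 /home/user/scrape_to_file.py
import json
from urllib.request import urlopen
from bs4 import BeautifulSoup

url = "http://books.toscrape.com/"
soup = BeautifulSoup(urlopen(url), 'html.parser')

# collect title/link pairs, the same way the bot code below does
data = [{'title': tag.text.strip(), 'link': url + anchor['href']}
        for tag in soup.find_all('h3')
        for anchor in tag.find_all('a')]

# the bot would then read this file inside its commands instead of scraping itself
with open('/home/user/patches.json', 'w') as f:
    json.dump(data, f)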
EDIT:
Minimal working code.
I use the page http://books.toscrape.com, which was created for learning web scraping.
I changed a few elements. There is no need to create a client when there is a bot, because bot is a special kind of client. I keep each title and link as a dictionary

{
    'title': tag.text.strip(),
    'link': url + anchor['href'],
}

so later it is easier to create text like

title: A Light in the ...
link: http://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html
import os

import discord
from discord.ext import commands, tasks
from bs4 import BeautifulSoup
from urllib.request import urlopen

# default values at start (before `scrape` assigns new values),
# because some function may try to use these variables before `scrape` creates them
links_and_titles = []   # PEP8: `lower_case_names`
counter = 0
items = []

bot = commands.Bot(command_prefix='!')

@tasks.loop(seconds=5)
async def scrape():
    global links_and_titles
    global counter
    global items

    url = "http://books.toscrape.com/"
    page = urlopen(url)
    soup = BeautifulSoup(page, 'html.parser')

    #pbe_titles = soup.find_all('h1', attrs={'class': 'news-title'})
    pbe_titles = soup.find_all('h3')

    # remove previous content
    links_and_titles = []

    for tag in pbe_titles:
        for anchor in tag.find_all('a'):
            links_and_titles.append({
                'title': tag.text.strip(),
                'link': url + anchor['href'],
            })

    counter = len(links_and_titles)
    print('counter:', counter)

    items = [f"title: {x['title']}\nlink: {x['link']}" for x in links_and_titles]

@bot.command(name='patch')
async def pbe_patch(ctx, *, arg=None):
    if arg is None:
        await ctx.send('Use: !patch date')
    elif any(item['title'].startswith(arg) for item in links_and_titles):
        await ctx.send(arg + " exists!")
    else:
        await ctx.send(f'The date you entered: "{arg}" does not have a patch associated with it or that patch expired.')

@bot.command(name='current')
async def current_patch(ctx, *, number: int = 1):  # `int` annotation converts `!current 3` to a number
    if items:
        responses = items[:number]
        text = '\n----\n'.join(responses)
        await ctx.send(text)
    else:
        await ctx.send('no patches')

scrape.start()

token = os.getenv('DISCORD_TOKEN')
bot.run(token)
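One optional refinement, not required for the code above: tasks.loop supports a before_loop hook, so if the scraper ever needs the bot's connection (for example to post new patches to a channel), the first run can be delayed until the bot has logged in. A minimal sketch, assuming discord.py's tasks API:

@scrape.before_loop
async def before_scrape():
    # wait until the bot is logged in before the first scrape runs
    await bot.wait_until_ready()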