I have the following code. When you understand the code, you can look at the two comments with
the capital letters. I could test if there are new items in the channel with insert or ignore
but
I'm trying the better mechanism with utilization feed.updated_parsed
attribute. Why doesn't it work
as expected?
from __future__ import unicode_literals
import feedparser
from sqlite3 import dbapi2 as sqlite
import sys, os
from datetime import datetime
from time import mktime
from daeutils import *
import re
import random
import optparse
import curses
import socket
def getActiveChannels():
"""Returns a list of active RSS channels"""
con = sqlite.connect(connectionString)
cur = con.cursor()
cur.execute("select id, title, xmlurl, updated from channels")
channels = cur.fetchall()
cur.close()
con.close()
return channels
def getItemsForChannel(xmlUrl, lastUpdate):
socket.setdefaulttimeout(60)
feedparserDictionary = feedparser.parse(xmlUrl)
updatedTime = datetime.fromtimestamp(mktime(feedparserDictionary.feed.updated_parsed))
lst = datetime.strptime(lastUpdate, "%Y-%m-%dT%H:%M:%S.%f")
if updatedTime < lst:
return [] # HERE NOT BEHAVING CORRECTLY, WHEN I COMMENT THIS LINE, THERE MAY BE A FEW ITEMS
items = feedparserDictionary.entries
print "There are new %d items" % len(items)
return items
def setChannelUpdateTime(xmlUrl, tm):
con = sqlite.connect(connectionString)
cur = con.cursor()
cur.execute("update channels set updated = :tm where xmlurl = :xmlUrl", locals())
con.commit()
print "updated successfully"
cur.close()
con.close()
if __name__ == "_main__":
con = sqlite.connect(connectionString)
for channel in getActiveChannels():
channelId, channelTitle, channelXmlUrl, lastChannelUpdate = channel
countOfNewItems = 0
items = getItemsForChannel(channelXmlUrl, lastChannelUpdate)
for item in items:
title, link, description, priority, updated = item
cur = con.cursor()
cur.execute("insert or ignore into feeds \
(title, link, description, read, updated, channelid) \
values (?, ?, ?, ?, ?, ?)", \
(title, link, description, 0, updated, channelId))
countOfNewItems += cur.rowcount # WHICH ARE INSERTED HERE
con.commit()
cur.close()
if countOfNewItems:
print "Found new items"
now = datetime.now().isoformat()
if "." not in now:
now = now + ".000000"
setChannelUpdateTime(channelXmlUrl, now)
Here are the two tables in sqlite:
CREATE TABLE channels (id integer primary key, title string, text string, description string, type string, xmlurl string unique, htmlurl string, priority integer, active integer, deactivated integer, updated text);
CREATE TABLE feeds (id integer primary key, title string, link string unique, description string, read integer, priority integer, updated string, channelid integer, foreign key (channelid) references channels(id));
I think the possible error is that you are trying to compare updated
field on the feed, the feeds could be not well supported by the feed creator. Or timezone formatting because of using isoformat or etc.
Anyway, I believe that it is much better to compare PER ENTRY updated
properties rather than comparing the feed property which is mostly used for invalidating feed cache.
Here is a working example, where I return only new entries from the function.
import socket
from datetime import datetime, timedelta
from time import mktime
import feedparser
from pprint import pprint
def getItemsForChannel(xmlUrl, lastUpdate):
lst = datetime.fromisoformat(lastUpdate)
socket.setdefaulttimeout(60)
parsed = feedparser.parse(xmlUrl)
items = [entry for entry in parsed.entries if
datetime.fromtimestamp(mktime(entry.updated_parsed)) > lst]
print("There are new {} items".format(len(items)))
return items
pprint(getItemsForChannel(
'http://serverfault.com/feeds/tag/+or+linux+or+ubuntu+or+vim+or+rsync+or+gnome',
(datetime.now() - timedelta(hours=3)).isoformat()
))
It uses from/to iso formatting for the last parsed date in your database value and compares entries per entry instead of global comparison based on the feed updated
property.