scrapy-items

ItemLoader causes start_requests() TypeError: 'NoneType' object is not iterable


For some reason, when the ItemLoader is involved in the code, it causes this error:

start_requests = iter(self.spider.start_requests())
TypeError: 'NoneType' object is not iterable

Below is the code for GetTikTokMetricsSpider.py and items.py, respectively. The median values are computed fine without the ItemLoader block at the end of get_medians, but with the ItemLoader the spider never even gets into def get_medians. I tried putting the whole process, including the ItemLoader, inside start_requests and it returns the same error. How could the ItemLoader be causing the error here? Here is the code.

GetTikTokMetricsSpider.py:

import scrapy
import json
import csv
import os
import pandas as pd
import numexpr as ne
from scrapy.loader import ItemLoader
from ScrapeTikTok.items import MedsItem
from TikTokLocalPaths import get_project_path


class GettiktokmetricsSpider(scrapy.Spider):
    name = 'GetTikTokMetricsSpider'
    custom_settings = {
        "FEEDS": {
            "data/metrics.csv": {
                "format": "csv",
                "overwrite": True
            }
        },
        "FEED_EXPORT_FIELDS": [
            "user", "view_med", "like_med", "comment_med", "share_med"
        ],
    }


    def start_requests(self):
        print("START REQUEST")
        users = self.get_users()
        print(users)
        for user in users:
            get_medians = self.get_medians(user)

    def get_medians(self, user):
        print("GET MEDIANS")
        df_counts = self.get_df_counts()
        df_counts.query(f"user == '{user}'", inplace=True)

        df_counts["view_med"] = df_counts["view_count"].median(axis=0)
        df_counts["like_med"] = df_counts["like_count"].median(axis=0)
        df_counts["comment_med"] = df_counts["comment_count"].median(axis=0)
        df_counts["share_med"] = df_counts["share_count"].median(axis=0)

        view_med = df_counts["view_med"].iloc[0]
        like_med = df_counts["like_med"].iloc[0]
        comment_med = df_counts["comment_med"].iloc[0]
        share_med = df_counts["share_med"].iloc[0]

        print(user)
        print(view_med)
        print(like_med)
        print(comment_med)
        print(share_med)

        print(type(view_med))
        print(type(like_med))
        print(type(comment_med))
        print(type(share_med))  # Works up to here without the ItemLoader (il) block below

        il = ItemLoader(item=MedsItem())
        il.add_value("user", user)
        il.add_value("view_med", view_med)
        il.add_value("like_med", like_med)
        il.add_value("comment_med", comment_med)
        il.add_value("share_med", share_med)
        yield il.load_item()
        print(MedsItem())

    def get_users(self):
        counts_url = self.get_csv_counts_url()
        df_counts = pd.read_csv(counts_url)
        users = df_counts["user"].unique()
        return users

    def get_df_counts(self):
        counts_url = self.get_csv_counts_url()
        df_counts = pd.read_csv(counts_url)
        return df_counts

    def get_csv_counts_url(self):
        url = f"{get_project_path()}/data/counts.csv"
        return url

items.py:

import scrapy
from scrapy.loader import ItemLoader
from itemloaders.processors import Join, MapCompose, TakeFirst, Identity
from w3lib.html import remove_tags

def get_count(view):
    view_count = str(view)
    # Placeholder labels such as "Share"/"Comment"/"Like" mean no number was scraped
    if any(word in view_count for word in
           ("Share", "share", "Comment", "comment", "Like", "like")):
        return "0"
    if "." in view_count:
        view_count = view_count.replace(".", "")
    if view_count.endswith("K"):
        view_count = view_count.replace("K", "000")
    if view_count.endswith("M"):
        view_count = view_count.replace("M", "000000")
    return view_count

def get_med(value):
    if isinstance(value, str):
        return value
    return str(value)

class CountsItem(scrapy.Item):
    user = scrapy.Field(input_processor=MapCompose(remove_tags), output_processor = TakeFirst())
    view_count = scrapy.Field(input_processor=MapCompose(remove_tags, get_count), output_processor = TakeFirst())
    like_count = scrapy.Field(input_processor=MapCompose(remove_tags, get_count), output_processor = TakeFirst())
    comment_count = scrapy.Field(input_processor=MapCompose(remove_tags, get_count), output_processor = TakeFirst())
    share_count = scrapy.Field(input_processor=MapCompose(remove_tags, get_count), output_processor = TakeFirst())

class MedsItem(scrapy.Item):
    user = scrapy.Field(input_processor=MapCompose(get_med), output_processor = TakeFirst())
    view_med = scrapy.Field(input_processor=MapCompose(get_med), output_processor = TakeFirst())
    like_med = scrapy.Field(input_processor=MapCompose(get_med), output_processor = TakeFirst())
    comment_med = scrapy.Field(input_processor=MapCompose(get_med), output_processor = TakeFirst())
    share_med = scrapy.Field(input_processor=MapCompose(get_med), output_processor = TakeFirst())

UPDATE (SOLVED): It seems the error is caused by Scrapy's general structure, which requires start_requests to yield Request objects. A simple solution is to request any URL or file and pass callback=self.parse. Here I create an empty HTML file so that as little data as possible is passed around. There is no promise Scrapy won't change in the future and start refusing or terminating automatically once it detects an empty response; in that case I think adding a bit of text to the file would be fine, but this solves the problem for now:

import scrapy
import os
import pandas as pd
from scrapy.loader import ItemLoader
from ScrapeTikTok.items import MedsItem
from TikTokLocalPaths import get_project_path, get_project_file_path


class GettiktokmetricsSpider(scrapy.Spider):
    name = 'GetTikTokMetricsSpider'
    custom_settings = {
        "FEEDS": {
            "data/metrics.csv": {
                "format": "csv",
                "overwrite": True
            }
        },
        "FEED_EXPORT_FIELDS": [
            "user", "view_med", "like_med", "comment_med", "share_med"
        ],
    }


    def start_requests(self):
        self.create_empty_html()
        empty_html = f"{get_project_file_path()}/data/empty_html.html"
        yield scrapy.Request(empty_html, callback=self.parse)

    def create_empty_html(self):
        empty_html = f"{get_project_path()}/data/empty_html.html"
        if not os.path.isfile(empty_html):
            with open(empty_html, "w", encoding="utf-8") as file:
                file.write("")

    def parse(self, response):
        users = self.get_users()
        for user in users:
            df_counts = self.get_df_counts()
            df_counts.query(f"user == '{user}'", inplace=True)

            df_counts["view_med"] = df_counts["view_count"].median(axis=0)
            df_counts["like_med"] = df_counts["like_count"].median(axis=0)
            df_counts["comment_med"] = df_counts["comment_count"].median(axis=0)
            df_counts["share_med"] = df_counts["share_count"].median(axis=0)

            view_med = df_counts["view_med"].iloc[0]
            like_med = df_counts["like_med"].iloc[0]
            comment_med = df_counts["comment_med"].iloc[0]
            share_med = df_counts["share_med"].iloc[0]

            il = ItemLoader(item=MedsItem())
            il.add_value("user", user)
            il.add_value("view_med", view_med)
            il.add_value("like_med", like_med)
            il.add_value("comment_med", comment_med)
            il.add_value("share_med", share_med)
            yield il.load_item()

    def get_users(self):
        counts_url = self.get_csv_counts_url()
        df_counts = pd.read_csv(counts_url)
        users = df_counts["user"].unique()
        return users

    def get_df_counts(self):
        counts_url = self.get_csv_counts_url()
        df_counts = pd.read_csv(counts_url)
        return df_counts

    def get_csv_counts_url(self):
        url = f"{get_project_path()}/data/counts.csv"
        return url
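
As an aside, the dummy file could probably be avoided altogether: as far as I can tell Scrapy also ships a download handler for the data: scheme, so a throwaway data: URI request should be enough to reach parse without touching the filesystem. A sketch, assuming that handler is available in your Scrapy version:

    def start_requests(self):
        # assumption: Scrapy's built-in data: download handler serves this
        # tiny request, so no empty_html.html file is needed
        yield scrapy.Request("data:,", callback=self.parse)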

Solution

  • Your start_requests doesn't return or yield at all. So the return value is always going to be NoneType.
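
    That is exactly the line in the traceback: Scrapy calls iter() on whatever start_requests returns, and since yours returns None it ends up doing the equivalent of:

    iter(None)  # TypeError: 'NoneType' object is not iterable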

    In this loop you hand the process over to the get_medians method:

    for user in users:
        get_medians = self.get_medians(user)
    

    And then in the get_medians method you yield the loaded item:

    yield il.load_item()
    print(MedsItem())
    

    So nothing ever makes it back out of start_requests: calling get_medians(user) just creates a generator object, which is stored in the variable get_medians.

    At this point you would need to yield that result onward. Instead, the next iteration of the loop begins and the get_medians variable is simply overwritten.

    Adding a yield statement to your start_requests removes the immediate TypeError, because the method then returns a generator instead of None.

    For example:

    for user in users:
        get_medians = self.get_medians(user)
        yield get_medians
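
    Note that because get_medians is itself a generator, yield get_medians only hands Scrapy the generator object; yield from is what would actually forward the loaded items, along the lines of:

    for user in users:
        yield from self.get_medians(user)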
    

    Unfortunately, even this will likely lead to errors: Scrapy takes whatever start_requests yields, expecting scrapy.Request objects, and sends them straight to the scheduler, where they are eventually turned into scrapy.Response objects.

    Since Scrapy expects Request objects from start_requests and expects items to be yielded from the parse callbacks, you can use just about any request to get into the parse method, and then run your code there.

    For example:

    
    
        def start_requests(self):
            # you can use any url that will successfully create a response
            # object. this one should work though
            yield scrapy.Request(url="https://quotes.toscrape.com")
    
        def parse(self, response):
            print("START REQUEST")
            users = self.get_users()
            print(users)
            for user in users:
                # get_medians is a generator, so yield from it to pass
                # the loaded items on to the exporter
                yield from self.get_medians(user)
    
        ...
        ...