The functionality I want to achieve is to create a new class and import users. Since I don't want the same user to be imported multiple times and a user can join different classes, I have used get_or_create(). However, I have noticed that the program becomes slow when importing a large number of users at once (using CSV to import users).
def post(self, request):
class_data = request.data
class_data["start_time"] = dateutil.parser.parse(class_data["start_time"])
class_data["end_time"] = dateutil.parser.parse(class_data["end_time"])
class_data["created_by"] = request.user
if class_data["end_time"] <= class_data["start_time"]:
return self.error("Start time must occur earlier than end time")
user_data = class_data.pop("users") # Retrieve user data and remove it from the class data
try:
with transaction.atomic():
if Class.objects.filter(title=class_data["title"]).exists():
return self.error("Class with the same title already exists")
class_obj = Class.objects.create(**class_data) # Create the class object
# Add the creator to the class members
if not class_obj.users.filter(id=request.user.id).exists():
class_obj.users.add(request.user)
for data in user_data:
if len(data) != 4 or len(data[1]) > 32:
return self.error(f"Error occurred while processing data '{data}'")
username = data[1]
user, created = User.objects.get_or_create(username=username, defaults={
"password": make_password(data[1]),
"college": data[3],
"student_number": data[1]
})
if created:
profile = UserProfile(user=user, real_name=data[2])
profile.save()
# class_obj.users.add(user)
if not class_obj.users.filter(id=user.id).exists():
class_obj.users.add(user)
return self.success(ClassAdminSerializer(class_obj).data)
except IntegrityError as e:
return self.error(str(e).split("\n")[1])
I have identified three areas that slow down the program. Firstly, the get_or_create() operation takes a long time as it checks for the existence of each user individually. Secondly, the profile.save() operation inserts data one by one, resulting in significant time consumption. Lastly, the class_obj.users.add(user) operation for inserting data into the association table between the Class and User models also takes a considerable amount of time. These are the models for my User and Class:
class User(AbstractBaseUser):
id = models.BigAutoField(primary_key=True)
username = models.TextField(unique=True)
email = models.TextField(blank=True, null=True)
# ADD
college = models.TextField(default=College.COMMUNICATION_COLLEGE)
student_number = models.TextField(null=True)
major = models.TextField(null=True)
create_time = models.DateTimeField(auto_now_add=True, null=True)
# One of UserType
admin_type = models.TextField(default=AdminType.REGULAR_USER)
problem_permission = models.TextField(default=ProblemPermission.NONE)
reset_password_token = models.TextField(null=True)
reset_password_token_expire_time = models.DateTimeField(null=True)
# SSO auth token
auth_token = models.TextField(null=True)
two_factor_auth = models.BooleanField(default=False)
tfa_token = models.TextField(null=True)
session_keys = JSONField(default=list)
# open api key
open_api = models.BooleanField(default=False)
open_api_appkey = models.TextField(null=True)
is_disabled = models.BooleanField(default=False)
USERNAME_FIELD = "username"
REQUIRED_FIELDS = []
objects = UserManager()
def is_admin(self):
return self.admin_type == AdminType.ADMIN
def is_super_admin(self):
return self.admin_type == AdminType.SUPER_ADMIN
def is_admin_role(self):
return self.admin_type in [AdminType.ADMIN, AdminType.SUPER_ADMIN]
def can_mgmt_all_problem(self):
return self.problem_permission == ProblemPermission.ALL
def is_contest_admin(self, contest):
return self.is_authenticated and (contest.created_by == self or self.admin_type == AdminType.SUPER_ADMIN)
def is_Class_admin(self, class_obj):
return self.is_authenticated and (class_obj.created_by == self or self.admin_type == AdminType.SUPER_ADMIN)
class Meta:
db_table = "user"
class Class(models.Model):
title = models.TextField()
start_time = models.DateTimeField()
end_time = models.DateTimeField()
create_time = models.DateTimeField(auto_now_add=True)
# show real time rank or cached rank
real_time_rank = models.BooleanField(default=True)
created_by = models.ForeignKey(User, on_delete=models.CASCADE, related_name='created_classes')
users = models.ManyToManyField(User, related_name='enrolled_classes')
@property
def status(self):
if self.start_time > now():
return ClassStatus.CLASS_NOT_START
elif self.end_time < now():
return ClassStatus.CLASS_ENDED
else:
return ClassStatus.CLASS_UNDERWAY
def problem_details_permission(self, user):
return self.status == ClassStatus.CLASS_ENDED or \
user.is_authenticated and user.is_Class_admin(self) or \
self.real_time_rank
class Meta:
db_table = "Class"
ordering = ("-start_time",)
This is my first time working with Django models, and I'm not sure how to solve this problem. I would like to ask for suggestions on how to modify my code. Could you please advise me on the necessary changes?
First edit: I made the following modifications to the code.
def post(self, request):
class_data = request.data
class_data["start_time"] = dateutil.parser.parse(class_data["start_time"])
class_data["end_time"] = dateutil.parser.parse(class_data["end_time"])
class_data["created_by"] = request.user
if class_data["end_time"] <= class_data["start_time"]:
return self.error("Start time must occur earlier than end time")
user_data = class_data.pop("users") # Retrieve user data and remove it from the class data
try:
with transaction.atomic():
if Class.objects.filter(title=class_data["title"]).exists():
return self.error("Class with the same title already exists")
class_obj = Class.objects.create(**class_data) # Create the class object
# To add the creator to the class members
class_obj.users.add(request.user)
# Bulk create users
# To import users and add them to class
usernames = [data[1] for data in user_data]
# To check if a user already exists
existing_users = User.objects.filter(username__in=usernames)
existing_usernames = set(existing_users.values_list('username', flat=True))
users_to_create = []
users_to_add = []
for data in user_data:
if len(data) != 4 or len(data[1]) > 32:
return self.error(f"Error occurred while processing data '{data}'")
username = data[1]
if username not in existing_usernames:
# If the user does not exist, create a new user
user = User(username=username, password=make_password(data[1]),
college=data[3], student_number=data[1])
users_to_create.append(user)
users_to_add.append(user)
else:
# If the user already exists, add them to the class
user = existing_users.get(username=username)
if not class_obj.users.filter(id=user.id).exists():
users_to_add.append(user)
User.objects.bulk_create(users_to_create)
# Bulk create user profiles
profiles_to_create = [UserProfile(user=user, real_name=data[2]) for user, data in zip(users_to_create, user_data)]
UserProfile.objects.bulk_create(profiles_to_create)
# Bulk add users to class
class_obj.users.add(*users_to_add)
return self.success(ClassAdminSerializer(class_obj).data)
except IntegrityError as e:
return self.error(str(e).split("\n")[1])
I noticed that when I import users and all of them already exist in the database, the class creation process is very fast. However, when most of the users need to be newly created, there is a delay of about one minute between the SELECT and INSERT operations, as shown in the following diagram. I'm not sure what is causing this issue, and I even suspect it might be due to inadequate hardware on my computer. Is there any way to resolve this?
There are several ways in which you can improve this code, but most of the time this will come with trade-offs:
User.objects.filter(username__in=[data[1] for data in user_data]).in_bulk(field_name="username")
. Then you can replace your get_or_create by starting to look in the dict return by this method, and only falling back on creating a new one if it's not in the dictusers = User(username=...) for data in user_data; User.bulk_create(users)
, but it is dangerous because signals will not be sent, as it is the case for a single user creationclass.users.add
accepts multiple objects, so you can store all users in a list (whether they are created or new) and use class_inst.users.add(*users)