pythondjangowagtailwagtail-streamfieldwagtail-apiv2

Wagtail publish images programatically doesn't reflect in FE


Created a management command to import content from Headway app to wagtail. Since headwayapp doesn't have any api's i copied the html to a separate file and parsed it. Everything works fine apart from images, Unless i click publish to that specific page in admin dashboard images doesn't reflect. All other content seems to work fine. Here is the management command Any help would be appreciated!

from django.core.management.base import BaseCommand
from wagtail.models import Page
from wagtail.images.models import Image
from wagtail.embeds.models import Embed
from home.models import ChangelogPost, Category, Product
from bs4 import BeautifulSoup
from datetime import datetime
import uuid
from io import BytesIO
import requests
import re
from django.core.files.images import ImageFile


class Command(BaseCommand):
    help = "Import HTML content into Wagtail as ChangelogPost objects."

    def handle(self, *args, **kwargs):

        with open("subscription/migrateHTML/migrate.html", "r", encoding='utf-8') as f:
            html_doc = f.read()

        soup = BeautifulSoup(html_doc, 'html.parser')

        content = []
        blog_divs = soup.find_all('div', class_='changelogItem published')

        for div in blog_divs:

            h2 = div.find('h2')
            content_div = div.find('div', class_='content')
            p = content_div.find('p')
            img_tags = content_div.find_all('img')
            img_htmls = [img.prettify() for img in img_tags]

            h3 = content_div.find('h3')
            if h3:
                span_text = h3.text.strip()

            # Get the content html, ignoring the img tag

            to_decompose = []

            for child in p.next_siblings:
                soup = BeautifulSoup(str(child), 'html.parser')
                if soup.find('img') is not None:
                    to_decompose.append(child)

            for child in to_decompose:
                child.decompose()

            # Get the content html, ignoring the span tag
            content_html = ''.join(
                str(sibling) for sibling in p.next_siblings if sibling.name != 'span')

            # include the first p tag in the content
            content_html = str(p) + content_html

            # Extract date_published from time tag
            date_published = div.find(
                'time')['datetime'] if div.find('time') else None

            if date_published:
                # Convert date_published to the correct format
                date_published = datetime.strptime(
                    date_published, "%Y-%m-%dT%H:%M:%SZ").strftime("%Y-%m-%d")

            if h2:
                # Add the span text as the fourth value
                content.append(
                    [h2.text.strip(), content_html, img_htmls, span_text, date_published])

        parent_page = Page.objects.filter(slug="home")[0]
        categories = Category.objects.all()
        product = Product.objects.all().first()

        for index in range(len(content)):

            # Generate a random UUID
            unique_id = uuid.uuid4()
            unique_id_str = str(unique_id).replace('-', '')[:10]

            val = content[index]

            match = [category for category in categories if val[3]
                     == category.name]

            img_arr = []
            if val[2] is not None:

                for item in val[2]:

                    img_src_regex = r'src="([^"]*)"'
                    src = re.search(img_src_regex, item).group(1)

                    if not src.startswith('https://'):
                        src = 'https:'+src

                    http_res = requests.get(src)
                    title = str(uuid.uuid4().int)[:6] + '.jpg'
                    image = Image(title=title, file=ImageFile(
                        BytesIO(http_res.content), name=title))
                    image.save()

                    imgstr = f"""<embed embedtype='image' id="{
                        image.id}" format='fullwidth' alt='happyfox images'/>"""
                    img_arr.append(imgstr)

            if img_arr:
                for im in img_arr:
                    val[1] += im

            page = ChangelogPost(
                title=val[0],
                content=f"""{val[1]}""",
                slug=unique_id_str,
                published_date=val[4],
                categories=match,
                products=[product]
            )

            new_page = parent_page.add_child(instance=page)
            new_page.save_revision().publish()

        self.stdout.write(
            self.style.SUCCESS(
                f"Migration successfully completed"
            )
        )

I tried to publish new_page.save_revision().publish() with this command but images are not rendering. Ideally image should render.

EDIT : Found the issue, embed tag attributes should be in double quotes rather than single. As per docs


Solution

  • When your import script writes an <embed> tag to the rich text field, the attributes need to be quoted with double quotes, not single quotes, as per the rich text data format docs:

            imgstr = f"""<embed embedtype="image" id="{
                image.id}" format="fullwidth" alt="happyfox images" />"""
    

    This is because the rich text handling code replaces the <embed> tag using a regexp - the format of the tag must exactly match the one used by Wagtail itself.