I'm making an XML document for Google Shopping and had trouble with the encoding of the product descriptions. I've tried different approaches: `str_replace` and escaping some characters, `utf8_encode`, `iconv` and some custom functions, but all of them generated encoding errors. Then I found someone who used `createTextNode`, and that seemed to work for me: I got no errors.
The only problem I have now is that I can't get the `createTextNode` result into the right namespace (I hope I'm saying that the right way).
This is the code that worked, but failed on some product description due to encoding issues:
// Creates a <description> element in the Atom namespace, with its content passed as the
// third argument of createElementNS(), and appends it to $addProduct.
$addProduct->appendChild($domtree->createElementNS($xmlns['atom'], 'description', 'test content'));
This creates the right line:
<description>test content</description>
Now I want to use the createTextNode
code, but can't get it working with the right tags. This works:
// Appends a bare text node directly to $addProduct; no <description> element is created
// around it, so the text ends up as direct content of the entry.
$addProduct->appendChild($domtree->createTextNode('test content'));
But this just puts the content in my main entry, while it needs to be between the description tags.
How can I put it in the description tags? Or if you know a good way to solve the encoding issues while using my old code, that's fine too.
Here is the entire code I'm using:
/**
 * Escape text so that the only named entity references left are the five XML predefines.
 *
 * The text is first passed through htmlentities() with double encoding disabled. After
 * that, the ampersand of every named entity that XML does not predefine (for example
 * "&eacute;" or "&copy;") is escaped, so "&eacute;" becomes "&amp;eacute;". Numeric
 * references such as "&#45;" are left alone because the pattern only matches
 * alphanumeric entity names.
 *
 * @param string|null $text    Text to escape; null is treated as an empty string.
 * @param string      $charset Character set of $text, handed to htmlentities().
 *
 * @return string|null The escaped text (null only if preg_replace() reports an error).
 */
function function_xml_entities($text = null, $charset = 'ISO-8859-1'){
    // Cast first: passing null to htmlentities() is deprecated on current PHP versions.
    $text = htmlentities((string) $text, ENT_COMPAT, $charset, false);

    // Names of the entities XML predefines; these must stay usable as references.
    $xml_entity_names = array('quot', 'amp', 'apos', 'lt', 'gt');

    // A single negative lookahead rejects all predefined names. The previous
    // "(?(?!a)(?!b)...)" construct was parsed by PCRE as a conditional, which let
    // "&quot;" slip through and be double-escaped to "&amp;quot;".
    $pattern = '/&(?!(?:' . implode('|', $xml_entity_names) . ');)([a-zA-Z0-9]+;)/';

    // Turn "&name;" into "&amp;name;" for every entity name XML does not know.
    return preg_replace($pattern, '&amp;${1}', $text);
}
/**
 * Reduce an HTML fragment to plain text.
 *
 * Opening tags of headings, tables, rows, list items, line breaks, paragraphs and divs
 * become line breaks; adjacent table cells are joined with " - "; every remaining tag is
 * removed. Runs of spaces and of newlines are then collapsed and each line is trimmed.
 * Entities such as "&amp;" are NOT decoded here.
 *
 * @param string|null $html HTML fragment; null is treated as an empty string.
 *
 * @return string The plain-text rendition.
 */
function function_html2text($html = null){
    // Cast first: passing null as the preg_replace() subject is deprecated on current PHP.
    $html = (string) $html;

    // "[^>]*" (instead of "[^>]+") also matches tags without attributes such as <p> or
    // <br>; "\b" keeps <p from matching <pre> and <li from matching <link>.
    $html = preg_replace('~<(?:h[1-6]|table|tr|li|br|p|div)\b[^>]*>~i', "\n", $html);

    // A cell boundary (</td><td>, </th><th>, ...) becomes a visible separator.
    $html = preg_replace('~</t[dh]>\s*<t[dh]\b[^>]*>~i', ' - ', $html);

    // Drop every tag that is still present.
    $html = preg_replace('~<[^>]+>~s', '', $html);

    // reducing spaces
    $html = preg_replace('~ +~', ' ', $html);
    $html = preg_replace('~^\s+~m', '', $html);
    $html = preg_replace('~\s+$~m', '', $html);

    // reducing newlines
    $html = preg_replace('~\n+~', "\n", $html);

    return $html;
}
// Builds the Google Shopping Atom feed from the product query and prints it as XML.
//
// NOTE(review): the mysql_* extension was removed in PHP 7. Migrating to mysqli or PDO
// needs the connection handle, which is set up outside this section.
$sql_products = "QUERY WHICH IS NOT RELEVANT";
$result_products = mysql_query($sql_products);

// Namespaces used in the feed: Atom for the structure, "g" for the Google attributes.
$xmlns = array('atom' => 'http://www.w3.org/2005/Atom','g' =>'http://base.google.com/ns/1.0');

// Create the DOM document once, with UTF-8 as the output encoding.
$domtree = new DOMDocument('1.0', 'UTF-8');
$domtree->formatOutput = true;

// Root element in the Atom namespace; declare the "g" prefix on it explicitly so the
// declaration is not repeated on every g:* element.
$xmlRoot = $domtree->appendChild($domtree->createElementNS($xmlns['atom'], 'feed'));
$xmlRoot->setAttributeNS('http://www.w3.org/2000/xmlns/', 'xmlns:g', $xmlns['g']);

// Appends a namespaced element to $parent and, when $text is given, stores it as a text
// node. Text nodes escape "&", "<" and ">" on output; the value argument of
// createElement()/createElementNS() does not, so a raw "&" in it breaks.
$addElement = function ($parent, $namespace, $name, $text = null) use ($domtree) {
    $element = $parent->appendChild($domtree->createElementNS($namespace, $name));
    if ($text !== null && (string) $text !== '') {
        $element->appendChild($domtree->createTextNode((string) $text));
    }
    return $element;
};

// Cuts $text to at most $length characters. With mbstring the cut is made on character
// boundaries, so a multi-byte character is never split in half.
// NOTE(review): assumes the database returns UTF-8, which DOM requires - TODO confirm.
$truncate = function ($text, $length) {
    $text = (string) $text;
    return function_exists('mb_substr')
        ? mb_substr($text, 0, $length, 'UTF-8')
        : substr($text, 0, $length);
};

// Standard feed metadata such as title, link and author.
$addElement($xmlRoot, $xmlns['atom'], 'title', 'title');
$link = $addElement($xmlRoot, $xmlns['atom'], 'link');
$link->setAttribute('rel', 'self');
$link->setAttribute('href', $global_websitenaam_include);
// Atom date constructs must be RFC 3339 timestamps.
$addElement($xmlRoot, $xmlns['atom'], 'updated', date(DATE_ATOM));
$addAuthor = $addElement($xmlRoot, $xmlns['atom'], 'author');
$addElement($addAuthor, $xmlns['atom'], 'name', 'author name');
$addElement($xmlRoot, $xmlns['atom'], 'id', 'tag:website.com,'.date('Y-m-d'));

// Loop over the products.
while($product = mysql_fetch_assoc($result_products)){
    //HERE ARE OTHER QUERIES AND DEFINING VARIABLES WHICH AREN'T RELEVANT TO THE CODE

    // Description as plain text: strip the markup, decode HTML entities (the text node
    // re-escapes whatever XML needs), turn non-breaking spaces into normal spaces and
    // limit the length. Pre-escaping entities here would end up double-encoded.
    $product_content = function_html2text($product['content']);
    $product_content = html_entity_decode($product_content, ENT_QUOTES, 'UTF-8');
    $product_content = str_replace("\xC2\xA0", ' ', $product_content);
    $product_content = $truncate($product_content, 5000);

    // create the products
    $addProduct = $addElement($xmlRoot, $xmlns['atom'], 'entry');
    $addElement($addProduct, $xmlns['atom'], 'id', $product['id']);
    $addElement($addProduct, $xmlns['atom'], 'title', $truncate($product['name'], 150));
    $linkProd = $addElement($addProduct, $xmlns['atom'], 'link');
    $linkProd->setAttribute('href', $global_websitenaam_include.'/'.rawurlencode($product['category_slug']).'/'.rawurlencode($product['slug']));

    $addElement($addProduct, $xmlns['g'], 'g:price', number_format($product_price, 2, ',', '.'));
    $addElement($addProduct, $xmlns['g'], 'g:condition', $condition_product);
    $addElement($addProduct, $xmlns['g'], 'g:brand', $truncate($product['manufacturer_name'], 70));
    $addElement($addProduct, $xmlns['g'], 'g:mpn', $product['typenumber']);
    $addElement($addProduct, $xmlns['g'], 'g:ean', $product['ean']);
    $addElement($addProduct, $xmlns['g'], 'g:image_link', $product_image);
    $addElement($addProduct, $xmlns['g'], 'g:product_type', 'Huis & Tuin > '.$parentcategory_name.$product['category_name']);
    $addElement($addProduct, $xmlns['g'], 'g:availability', $product_stock);
    $addElement($addProduct, $xmlns['g'], 'g:manufacturer', $product['supplier_name']);
    $addElement($addProduct, $xmlns['g'], 'g:weight', $product['weight']);
    $addElement($addProduct, $xmlns['g'], 'g:featured_product', $product_advertisement);
    $addElement($addProduct, $xmlns['g'], 'g:size', $product['size']);

    // Shipping block; its children live in the Google namespace as well.
    $addProductShipping = $addElement($addProduct, $xmlns['g'], 'g:shipping');
    $addElement($addProductShipping, $xmlns['g'], 'g:country', 'NL');
    $addElement($addProductShipping, $xmlns['g'], 'g:service', 'Standaard');
    $addElement($addProductShipping, $xmlns['g'], 'g:price', number_format($shipment_price, 2, ',', '.'));

    $addElement($addProduct, $xmlns['atom'], 'description', $product_content);
}

// Print the XML; the charset parameter of Content-Type uses "=", not ":".
header('Content-Type: text/xml; charset=utf-8');
echo $domtree->saveXML();
Character nodes do not have a namespace. There are two types of them: text nodes, which encode the XML special characters, and CDATA sections, which use a special syntax. Both can be used for `atom:description` and `atom:summary`. The expected content (for the Atom parser) depends on the `type` attribute.
The default is `text`, which is plain text; `html` means that it expects an HTML fragment encoded as text; `xhtml` means the content consists of child nodes in the XHTML namespace.
You should not use the content argument of `createElement()`/`createElementNS()` or set the `$nodeValue` property, unless you are sure there are no special characters in the value (empty string, integers, ...). They use a broken encoding. Create the character nodes using `DOMDocument::createTextNode()` or `DOMDocument::createCDATASection()`.
Here is a small example:
// Namespace URI shared by every Atom element in this example.
$xmlns = array('atom' => 'http://www.w3.org/2005/Atom');
$htmlFragment = '<div>Description HTML Fragment</div>';

$document = new DOMDocument();

// <entry> is the document element.
$entry = $document->createElementNS($xmlns['atom'], 'entry');
$document->appendChild($entry);

// Plain-text summary: the content is stored as a text node.
$summary = $document->createElementNS($xmlns['atom'], 'summary');
$entry->appendChild($summary);
$summary->setAttribute('type', 'text');
$summaryText = $document->createTextNode('Summary Text');
$summary->appendChild($summaryText);

// HTML description: the fragment is stored verbatim inside a CDATA section.
$description = $document->createElementNS($xmlns['atom'], 'description');
$entry->appendChild($description);
$description->setAttribute('type', 'html');
$descriptionHtml = $document->createCDATASection($htmlFragment);
$description->appendChild($descriptionHtml);

// Indent the serialized output and print it.
$document->formatOutput = true;
echo $document->saveXML();
Output:
<?xml version="1.0"?>
<entry xmlns="http://www.w3.org/2005/Atom">
<summary type="text">Summary Text</summary>
<description type="html"><![CDATA[<div>Description HTML Fragment</div>]]></description>
</entry>