Skip to content

Commit 01717c8

Browse files
committed
[KleinanzeigenBridge] random improvements and fixes
The previous MR #4820 introduced a bug where the URI was no longer expanded. This is because the URI is obtained from a non-standard data-href attribute, which defaultLinkTo() doesn't support. On top of that: - sanitize the HTML in Content - use the longer Description found in the JSON - fix timestamp processing, including for the relative "Today" and "Yesterday" strings - move media to enclosures - be explicit about the elements chosen to augment the description - simplify the image URL processing
1 parent ced9e56 commit 01717c8

File tree

1 file changed

+50
-17
lines changed

1 file changed

+50
-17
lines changed

bridges/KleinanzeigenBridge.php

Lines changed: 50 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -91,7 +91,6 @@ public function collectData()
9191
if ($this->queriedContext === 'By profile') {
9292
for ($i = 1; $i <= $this->getInput('pages'); $i++) {
9393
$html = getSimpleHTMLDOM($this->getURI() . '/s-bestandsliste.html?userId=' . $this->getInput('userid') . '&pageNum=' . $i . '&sortingField=SORTING_DATE');
94-
$html = defaultLinkTo($html, $this->getURI());
9594

9695
$foundItem = false;
9796
foreach ($html->find('article.aditem') as $element) {
@@ -120,7 +119,6 @@ public function collectData()
120119
]);
121120

122121
$html = getSimpleHTMLDOM($searchUrl);
123-
$html = defaultLinkTo($html, $this->getURI());
124122

125123
// end of list if returned page is not the expected one
126124
if ($html->find('.pagination-current', 0)->plaintext != $page) {
@@ -138,22 +136,57 @@ private function addItem($element)
138136
{
139137
$item = [];
140138

139+
$item['content'] = '';
140+
141+
$json = $element->find('.aditem-image > script', 0);
142+
if ($json) {
143+
$data = json_decode($json->innertext, true);
144+
$item['title'] = $data['title'];
145+
$item['content'] .= '<div><p>' . $data['description'] . '</div></p></br>';
146+
}
147+
else {
148+
$item['title'] = $element->find('h2', 0)->plaintext;
149+
$item['content'] .= $element->find('.aditem-main--middle--description');
150+
}
151+
152+
if ($element->find('.aditem-main--top', 0)) {
153+
$item['content'] .= $element->find('.aditem-main--top', 0);
154+
}
155+
156+
if ($element->find('.aditem-main--middle--price-shipping', 0)) {
157+
$item['content'] .= preg_replace(
158+
'#(<p\s+class="aditem-main--middle--price-shipping--old-price"[^>]*>.*?</p>)#si',
159+
'<s>$1</s>',
160+
$element->find('.aditem-main--middle--price-shipping', 0)
161+
);
162+
}
163+
164+
if ($element->find('.aditem-main--bottom', 0)) {
165+
$item['content'] .= $element->find('.aditem-main--bottom', 0);
166+
}
167+
168+
$item['content'] = sanitize($item['content']);
169+
141170
$item['uid'] = $element->getAttribute('data-adid');
142-
$item['uri'] = $element->getAttribute('data-href');
143-
144-
$item['title'] = $element->find('h2', 0)->plaintext;
145-
$item['timestamp'] = $element->find('div.aditem-main--top--right', 0)->plaintext;
146-
$imgUrl = str_replace(
147-
'rule=$_2.JPG',
148-
'rule=$_57.JPG',
149-
str_replace(
150-
'rule=$_35.JPG',
151-
'rule=$_57.JPG',
152-
$element->find('img', 0) ? $element->find('img', 0)->getAttribute('src') : ''
153-
)
154-
); //enhance img quality
155-
156-
$item['content'] = '<img src="' . $imgUrl . '"/>' . $element->find('div.aditem-main', 0)->outertext;
171+
$item['uri'] = urljoin($this->getURI(), $element->getAttribute('data-href'));
172+
173+
$dateString = trim($element->find('div.aditem-main--top--right', 0)->plaintext);
174+
if ($dateString) {
175+
$dateString = str_ireplace(
176+
['Gestern', 'Heute'],
177+
['yesterday', 'today'],
178+
$dateString);
179+
180+
$item['timestamp'] = strtotime($dateString);
181+
}
182+
else {
183+
$item['timestamp'] = time();
184+
}
185+
186+
if ($element->find('img', 0)) {
187+
//enhance img quality. Cannot use convertLazyLoading() here due to non-standard URI suffix in srcset.
188+
$item['enclosures'] = [preg_replace('/rule=\$_\d+\.AUTO/i', 'rule=$_57.AUTO', $element->find('img', 0)->getAttribute('src')) . '#.image'];
189+
};
157190

158191
$this->items[] = $item;
159192
}

0 commit comments

Comments
 (0)