- 01
- 02
- 03
- 04
- 05
- 06
- 07
- 08
- 09
- 10
- 11
- 12
- 13
- 14
- 15
- 16
- 17
- 18
- 19
- 20
- 21
- 22
- 23
- 24
- 25
- 26
- 27
- 28
- 29
- 30
- 31
- 32
- 33
- 34
- 35
- 36
- 37
- 38
- 39
- 40
- 41
- 42
- 43
- 44
- 45
- 46
- 47
- 48
- 49
- 50
- 51
- 52
- 53
- 54
- 55
- 56
- 57
- 58
- 59
- 60
import csv
import json
import requests
from io import StringIO
from lxml import etree
class DayPicParser(object):
    """Scrape paginated blog posts and dump them to a file as JSON.

    Listing pages are requested from ``<url>/page/<n>``. Every post found on
    a listing page is then downloaded in full through the link in its title.
    """

    # Base address of the site to scrape (the value here appears to be a
    # placeholder and must be replaced with a real URL before running).
    url = 'урл_до_сайта'

    def get_info(self, last_page=1077, output_path='daypic.csv', timeout=30):
        """Walk the listing pages and write all collected posts as JSON.

        Each collected post is a dict with the keys ``title``, ``preview``
        (first image address found in the post, or ``''``), ``short_text``
        and ``full_text`` (a list of ``{'image': ...}`` / ``{'text': ...}``
        items in document order).

        :param last_page: number of the last listing page to visit; pages
            ``1..last_page`` are fetched. A value below 1 fetches nothing.
        :param output_path: file that receives the JSON array. The default
            name ends in ``.csv`` for backward compatibility, but the
            content written is JSON.
        :param timeout: per-request timeout in seconds passed to ``requests``.
        :return: ``None``; the result is only written to ``output_path``.
        """
        # The file is opened first, so an unwritable path fails before any
        # network work starts; ``with`` guarantees the handle is closed even
        # when a request or a parse step raises.
        with open(output_path, 'w') as handler:
            rows = []
            page = 1
            while page <= last_page:
                listing = self._fetch_tree(
                    '{}/page/{}'.format(self.url, page), timeout)
                posts = listing.xpath(
                    '//div[@class="posts"]/div[@class="post"]')
                # Count from 1 so the progress line reads "Post 1 of N".
                for number, post in enumerate(posts, 1):
                    row = self._parse_post(post, timeout)
                    if row is not None:
                        rows.append(row)
                    print('Post {} of {}'.format(number, len(posts)))
                page += 1
                print(page)
            handler.write(json.dumps(rows))

    @staticmethod
    def _fetch_tree(address, timeout):
        """Download ``address`` and return it parsed as an lxml HTML tree."""
        # requests.get() runs on a throwaway session that is closed when the
        # call returns, so no explicit connection cleanup is needed here.
        response = requests.get(address, timeout=timeout)
        return etree.parse(StringIO(response.text), etree.HTMLParser())

    def _parse_post(self, post, timeout):
        """Build the row dict for one listing entry, or ``None`` if it has no link."""
        title = post.find('./h2/a')
        if title is None or 'href' not in title.attrib:
            # Without a title link there is no full post to fetch; skip the
            # entry instead of aborting the whole scrape.
            return None
        short_text = post.find('./div[@class="text"]//p[1]')

        full_tree = self._fetch_tree(title.attrib['href'], timeout)
        main_image = None
        full_text = []
        for inner in full_tree.xpath('//div[@class="text"]//p'):
            # Paragraphs that contain a <noindex> element are left out.
            if inner.find('.//noindex') is not None:
                continue
            image = inner.find('.//img')
            src = image.get('src') if image is not None else None
            if src:
                # The first image encountered doubles as the post preview.
                if not main_image:
                    main_image = src
                full_text.append({'image': src})
            text = etree.tostring(
                inner, method="text", encoding='utf-8').decode('utf-8')
            for paragraph in text.strip().split("\n"):
                if paragraph:
                    full_text.append({'text': paragraph})

        # ``.text`` is None when an element has no leading text, and the
        # summary paragraph may be absent altogether; fall back to ''.
        if short_text is not None:
            summary = (short_text.text or '').strip()
        else:
            summary = ''
        return {'title': (title.text or '').strip(),
                'preview': main_image if main_image else '',
                'short_text': summary,
                'full_text': full_text}
# Script entry point: instantiate the parser and run the scrape with its
# default settings.
# NOTE(review): this is a top-level statement with no ``__main__`` guard, so
# it also executes whenever this module is imported.
DayPicParser().get_info()
Her 10.07.2017 18:25 # 0
storvus 10.07.2017 18:28 # 0
ProgRamistYshka 20.07.2017 17:35 # 0
storvus 21.07.2017 09:34 # 0
syoma 26.12.2017 04:27 # 0
alex_matviichuk 25.08.2021 19:24 # 0