And how did you parse Pikabu?
import pickle
from datetime import datetime
from threading import Thread

import requests
from bs4 import BeautifulSoup

start = datetime.now()


def get_html(url):
    # Fetch a page; '404' is used as a sentinel for deleted/missing stories.
    with requests.Session() as session:
        session.headers['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
        session.headers['User-Agent'] = 'Mozilla/5.0 (X11; Linux x86_64; rv:64.0) Gecko/20100101 Firefox/64.0'
        r = session.get(url)
        if r.status_code == 404:
            return '404'
        return r.text


def write_csv(data, filename):
    # Despite the name, the output is tab-separated (TSV), one row per story.
    with open(filename, 'a', encoding='utf8', newline='') as f:
        f.write(f"{data['story_title']}\t"
                f"{data['link']}\t"
                f"{data['story_id']}\t"
                f"{data['data_rating']}\t"
                f"{data['data_timestamp']}\t"
                f"{data['story_comments']}\t"
                f"{data['data_author_id']}\t"
                f"{data['data_meta_rating']}\t"
                f"{data['story__views_hint']}\t"
                f"{data['user_name']}\t"
                f"{data['user_link']}\t"
                f"{data['story__community_link']}\n")


def get_page_data(text, filename):
    soup = BeautifulSoup(text, 'lxml')
    ads = soup.find_all('article', class_='story')
    for ad in ads:
        # Every field is wrapped in try/except so that one missing
        # tag or attribute does not abort the whole story.
        try:
            # Story pages use <h1> for the title, feed pages use <h2>.
            if ad.find('h2', class_='story__title') is None:
                story_title = ad.find('h1', class_='story__title').text.strip()
            else:
                story_title = ad.find('h2', class_='story__title').text.strip()
        except Exception:
            story_title = ''
        try:
            link = ad.find('a')['href']
        except Exception:
            link = ''
        try:
            story_id = ad['data-story-id']
        except Exception:
            story_id = ''
        try:
            data_rating = ad['data-rating']
        except Exception:
            data_rating = ''
        try:
            data_timestamp = ad.find('time', class_='caption story__datetime hint')['datetime'][:10]
        except Exception:
            data_timestamp = ''
        try:
            story_comments = ad.find(class_='story__comments-link-count').text
        except Exception:
            story_comments = ''
        try:
            data_author_id = ad['data-author-id']
        except Exception:
            data_author_id = ''
        try:
            data_meta_rating = ad['data-meta-rating']
        except Exception:
            data_meta_rating = ''
        try:
            story__views_hint = ad.find('div', class_='story__views hint')['aria-label'].split(' ')[0]
        except Exception:
            story__views_hint = ''
        try:
            user_name = ad.find('div', class_='user__info-item').text.strip()
        except Exception:
            user_name = ''  # the original assigned user__name here, a typo that broke this branch
        try:
            user_link = ad.find('div', class_='user__info-item').find('a')['href']
        except Exception:
            user_link = ''
        try:
            story__community_link = ad.find(class_='story__community-link')['href']
        except Exception:
            story__community_link = ''
        data = {'story_title': story_title,
                'link': link,
                'story_id': story_id,
                'data_rating': data_rating,
                'data_timestamp': data_timestamp,
                'story_comments': story_comments,
                'data_author_id': data_author_id,
                'data_meta_rating': data_meta_rating,
                'story__views_hint': story__views_hint,
                'user_name': user_name,
                'user_link': user_link,
                'story__community_link': story__community_link}
        write_csv(data, filename)


def make_all(urls, filename):
    for url in urls:
        text = get_html(url)
        if text != '404':
            get_page_data(text, filename)


def main():
    # Other feeds parsed the same way:
    # url = 'https://pikabu.ru/?page={}'
    # url = 'https://pikabu.ru/best/01-01-2020_17-08-2020?page={}'
    url = 'https://pikabu.ru/story/_{}'
    with open('lost-articles.pickle', 'rb') as f:  # 571 120
        u = pickle.load(f)
    # Contiguous 10 000-id chunks; the original slices started at 50001
    # etc. and silently skipped one id at each chunk boundary.
    urls1 = [url.format(str(i)) for i in u[50000:60000]]
    urls2 = [url.format(str(i)) for i in u[60000:70000]]
    urls3 = [url.format(str(i)) for i in u[70000:80000]]
    urls4 = [url.format(str(i)) for i in u[80000:90000]]
    urls5 = [url.format(str(i)) for i in u[90000:100000]]
    threads = [Thread(target=make_all, args=(chunk, name))
               for chunk, name in ((urls1, 'f1.txt'),
                                   (urls2, 'f2.txt'),
                                   (urls3, 'f3.txt'),
                                   (urls4, 'f4.txt'),
                                   (urls5, 'f5.txt'))]  # the original passed urls4 to thread5 as well
    for t in threads:
        t.start()
    for t in threads:
        t.join()


if __name__ == '__main__':
    main()

Thanks! Picked up something here that I hadn't understood before. Each post has a long and a short link.
I couldn't see how to enumerate the long links, but it turns out there is simply a short link with a number.
https://pikabu.ru/story/realistichnost_12343647
https://pikabu.ru/story/_12343647
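
Since both forms resolve to the same post, the numeric id at the end is all you need to enumerate stories. A minimal sketch of that trick (the helper names and the regex are my own illustration, not part of the script above):

import re

SHORT_URL = 'https://pikabu.ru/story/_{}'  # same pattern the script iterates over

def story_id_from_link(link):
    # Both link forms end in '_<digits>':
    #   https://pikabu.ru/story/realistichnost_12343647
    #   https://pikabu.ru/story/_12343647
    m = re.search(r'_(\d+)$', link)
    return int(m.group(1)) if m else None

def short_link(story_id):
    # The short form is built straight from the numeric id.
    return SHORT_URL.format(story_id)

print(story_id_from_link('https://pikabu.ru/story/realistichnost_12343647'))  # -> 12343647
print(short_link(12343647))  # -> https://pikabu.ru/story/_12343647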