这篇之前被关小黑屋啦,更新移步 http://www.lukou.com/userfeed/17240148
=========================
[西瓜]不会用python的看这里 http://www.lukou.com/userfeed/19864317
[西瓜]能爬到的最早日期:
精选团 http://www.lukou.com/topic/2?tagid=446 2015年5月23日 ,再之前的路况长相不太一样,懒得改code了
自由团 http://www.lukou.com/topic/2?tagid=447 2015年6月23日,再之前没有路况
爬虫只爬了现在在以上两个链接里的团贴,删掉的或者不在里面的我也没办法[鹿笑哭]
我偶像的黑心团长指路帖 http://share.lukou.com/sharefeed/12885645
[西瓜]自娱自乐的结果。
开团数/39个月,平均每个月开两三个团的也是棒棒了[鹿滑稽脸]
[西瓜]原始数据长这样,大家有什么想看的可以一起研究下[鹿doge脸]
=====================Python 3.6的分割线=====================
# -*- coding: UTF-8 -*-
# aali
from bs4 import BeautifulSoup
import requests
import json
import csv,codecs
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
import time
from selenium.webdriver import Remote
from selenium.webdriver.chrome import options
from selenium.common.exceptions import InvalidArgumentException
#http://www.lukou.com/aali
class ReuseChrome(Remote):
    """Remote driver that attaches to an ALREADY-RUNNING Chrome session.

    Workaround for sites that require login: start Chrome once, log in by
    hand, then reuse that browser's command-executor URL and session id
    here instead of opening (and having to re-authenticate) a new one.
    """

    def __init__(self, command_executor, session_id):
        # Stash the existing session id; start_session() below substitutes
        # it for the fresh session Remote.__init__ would normally create.
        self.r_session_id = session_id
        Remote.__init__(self, command_executor=command_executor, desired_capabilities={})

    def start_session(self, capabilities, browser_profile=None):
        """Override: adopt the pre-existing session instead of creating one.

        Mirrors the stock implementation's argument validation but never
        sends a NewSession command to the remote end.
        """
        if not isinstance(capabilities, dict):
            raise InvalidArgumentException("Capabilities must be a dictionary")
        if browser_profile:
            if "moz:firefoxOptions" in capabilities:
                capabilities["moz:firefoxOptions"]["profile"] = browser_profile.encoded
            else:
                capabilities.update({'firefox_profile': browser_profile.encoded})
        # Fabricate capabilities locally and point this driver at the open
        # session; w3c=False keeps the legacy (OSS) wire protocol the
        # original session was started with — TODO confirm for newer Selenium.
        self.capabilities = options.Options().to_capabilities()
        self.session_id = self.r_session_id
        self.w3c = False
def get_commit(browser, commit_url, fname, page=135):
    """Scrape group-buy listing pages and write one CSV row per feed entry.

    Parameters
    ----------
    browser : selenium WebDriver — an already logged-in session.
    commit_url : str — listing URL ending in ``&page=``; the page number
        (0 .. page-1) is appended to it.
    fname : str — output file name without extension; ``fname + '.csv'``
        is (over)written.
    page : int — number of listing pages to visit.

    Columns: title, author, time, participants, comments, likes, bookmarks.
    Counters that are absent on the page are written as empty strings.
    """
    SCROLL_PAUSE_TIME = 3

    # Open the file ONCE (the original reopened it per feed in append
    # mode).  utf_8_sig adds a BOM so Excel autodetects UTF-8; newline=''
    # is required by the csv module.  csv.writer quotes fields properly,
    # so titles containing commas no longer corrupt the row layout.
    with open(fname + '.csv', 'w', encoding='utf_8_sig', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['标题', '原po', '时间', '参团人数', '评论', '点赞', '收藏'])

        for p in range(page):
            uri = commit_url + str(p)
            print(uri)
            browser.get(uri)

            # The listing lazy-loads on scroll; scroll to the bottom a few
            # times so every feed is rendered before parsing the DOM.
            for _ in range(5):
                browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                ActionChains(browser).key_down(Keys.END).key_up(Keys.END).perform()
                time.sleep(SCROLL_PAUSE_TIME)

            soup = BeautifulSoup(browser.page_source, 'lxml')
            feeds = soup.find_all('div', class_="feed-wrap")
            print(len(feeds))

            for feed in feeds:
                # Some feeds miss sub-elements (deleted posts, ads); skip
                # just those instead of silently swallowing every error
                # as the original bare except did.
                try:
                    row = _parse_feed(feed)
                except (AttributeError, IndexError):
                    continue
                writer.writerow(row)

            time.sleep(1)


def _parse_feed(feed):
    """Extract one CSV row (list of 7 strings) from a feed <div>.

    Raises AttributeError/IndexError when an expected sub-element is
    missing — the caller skips such feeds.
    """
    title = feed.find('div', class_="title").a.get_text(', ')
    author_div = feed.find('div', class_="author")
    author = author_div.a.get_text(', ')
    l_time = author_div.findAll('a')[1].get_text(', ')
    # Text is a label followed by the count; strip the 5-char label prefix
    # and 1 trailing char, as in the original scraper.
    tuanppl = feed.find('div', class_="tuan-people").get_text()[5:-1]
    comment = _digits_or_blank(feed.find('a', class_="feed-comment").get_text())
    praize = _digits_or_blank(feed.find('a', class_="feed-praize").get_text().strip())
    collect = _digits_or_blank(feed.find('a', class_="feed-collect").get_text().strip())
    return [title, author, l_time, tuanppl, comment, praize, collect]


def _digits_or_blank(text):
    """Return *text* if it is a pure digit string, else '' (counter absent)."""
    return text if text.isdigit() else ''
if __name__ == '__main__':
    # Attach to an ALREADY-RUNNING, manually logged-in Chrome session.
    # Run the companion launcher script first; it prints the command
    # executor URL and session id to paste into the two constants below.
    url = 'http://www.lukou.com/topic/2?tagid=447'   # 自由团 listing
    fname = input('save as: ')
    commit_url = url + '&page='
    executor_url = 'http://127.0.0.1:50367'                 # from launcher output
    session_id = '48f19d231d9e8a850164497c40e0e63c'         # from launcher output
    # NOTE(review): the original also started a brand-new chromedriver here
    # that was never used — removed, since it only leaked an idle browser
    # process alongside the reused session.
    browser = ReuseChrome(command_executor=executor_url, session_id=session_id)
    get_commit(browser, commit_url, fname)
=====================Python 3.6的分割线=====================
[西瓜]路口现在也有反爬了,不登陆不给爬,但是我又懒得搞一个webdriver模拟登陆。下面这个是开了webdriver之后手动登陆,然后取executor_url 和session_id 用在主程序的。懒人的workaround
=====================Python 3.6的分割线=====================
# -*- coding: UTF-8 -*-
# aali
from bs4 import BeautifulSoup
import requests
import json
import csv,codecs
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
from selenium.webdriver import Remote
from selenium.webdriver.chrome import options
from selenium.common.exceptions import InvalidArgumentException
import time
if __name__ == '__main__':
    # Launcher half of the session-reuse workaround: open a fresh Chrome,
    # let the user log in to Lukou by hand, and print the connection
    # details (session id, then executor URL) that the scraper script
    # pastes into its ReuseChrome constants.
    chrome = webdriver.Chrome(executable_path=r'C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe')
    print(chrome.session_id)
    print(chrome.command_executor._url)
    # Land on the login-gated page so the user can authenticate; keep the
    # process alive briefly — the browser itself stays open afterwards.
    chrome.get("http://www.lukou.com/circle")
    time.sleep(10)